// OFApp/server/scrapers/turbo.js

import { writeFileSync, existsSync, readdirSync, statSync, createReadStream, unlinkSync } from 'fs';
import { join, basename } from 'path';
import { createHash } from 'crypto';
import { fsCreateSession, fsDestroySession, fsGet } from '../flaresolverr.js';
import { upsertMediaFile } from '../db.js';

// Dedup signature shared with the duplicate scanner in gallery.js: md5 over the
// first 64KB of a file, combined with the file's exact size.
const HASH_BYTES = 65536;

/**
 * Compute the md5 hex digest of the first HASH_BYTES bytes of a file (or of
 * the whole file when it is shorter than that).
 *
 * NOTE: despite the "Sync" suffix this function is asynchronous.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<string>} Hex md5 digest; rejects if the file cannot be read.
 */
async function hashFirst64kSync(filePath) {
  const md5 = createHash('md5');
  // `end` is inclusive, so this reads bytes [0, HASH_BYTES).
  const head = createReadStream(filePath, { start: 0, end: HASH_BYTES - 1 });
  for await (const piece of head) {
    md5.update(piece);
  }
  return md5.digest('hex');
}

// Index the regular files directly inside folderPath by byte size:
//   size -> [{ filename, path, hash: null }, ...]
// Every entry starts with hash === null so the hash can be filled in later,
// only when a size collision makes it necessary. Dot-files, non-files and
// entries that cannot be stat'ed are skipped; an unreadable folder yields an
// empty index.
function buildSizeIndex(folderPath) {
  const bySize = new Map();

  let names;
  try {
    names = readdirSync(folderPath);
  } catch {
    return bySize;
  }

  const visibleNames = names.filter((entryName) => !entryName.startsWith('.'));
  for (const filename of visibleNames) {
    const path = join(folderPath, filename);

    let info;
    try {
      info = statSync(path);
    } catch {
      // Entry vanished or is unreadable — leave it out of the index.
      continue;
    }
    if (!info.isFile()) continue;

    const entry = { filename, path, hash: null };
    const bucket = bySize.get(info.size);
    if (bucket === undefined) {
      bySize.set(info.size, [entry]);
    } else {
      bucket.push(entry);
    }
  }

  return bySize;
}

/**
 * Return the candidate's head hash, computing and caching it on first use.
 *
 * A hash that cannot be computed is cached as '' so the file is not re-read
 * on every lookup; callers treat '' as "never matches".
 *
 * @param {{path: string, hash: (string|null)}} c - Size-index entry (mutated).
 * @returns {Promise<string>} The cached or freshly computed hash ('' on failure).
 */
async function ensureCandidateHash(c) {
  if (c.hash == null) {
    let digest = '';
    try {
      digest = await hashFirst64kSync(c.path);
    } catch {
      // Unreadable file: keep the '' sentinel.
    }
    c.hash = digest;
  }
  return c.hash;
}

// Matches an http(s) URL whose host is "turbo.<tld>" (optionally prefixed with
// "www.") and which has at least a "/" after the host. Case-insensitive.
const TURBO_HOST_RE = /^https?:\/\/(?:www\.)?turbo\.\w+\//i;
// Origin used for building embed URLs, the FlareSolverr session and the Referer header.
const TURBO_BASE = 'https://turbo.cr';
// User-Agent sent with video downloads when the caller does not supply one.
const DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/**
 * Tell whether a URL points at a turbo.<tld> host.
 *
 * @param {string} url - URL to test.
 * @returns {boolean} true when the URL matches TURBO_HOST_RE.
 */
export function isTurboUrl(url) {
  const hostMatch = TURBO_HOST_RE.exec(url);
  return hostMatch !== null;
}

// Entities decoded by unescapeHtml, keyed by their full "&...;" spelling.
const HTML_ENTITY_MAP = Object.freeze({
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&#39;': "'",
  '&apos;': "'",
});

/**
 * Decode the basic HTML character entities found in attribute values.
 *
 * All entities are replaced in a single pass so that already-decoded text is
 * never decoded a second time: "&amp;lt;" becomes "&lt;", not "<". (Chained
 * replaces that handle "&amp;" first get this wrong.)
 *
 * @param {string} s - HTML-escaped text.
 * @returns {string} The text with &amp; &lt; &gt; &quot; &#39; &apos; decoded.
 */
function unescapeHtml(s) {
  return s.replace(/&(?:amp|lt|gt|quot|apos|#39);/g, (entity) => HTML_ENTITY_MAP[entity]);
}

/**
 * Pull the resolved mp4 URL out of an embed page's HTML.
 *
 * Strategies are tried in order and the first hit wins:
 *   1. the src attribute of a <video> tag (Plyr renders the resolved URL into
 *      <video src="..."> after WASM runs);
 *   2. any direct turbocdn mp4 URL appearing anywhere in the markup.
 *
 * @param {string} html - Rendered embed page.
 * @returns {(string|null)} HTML-unescaped mp4 URL, or null when none is found.
 */
function extractMp4FromHtml(html) {
  const strategies = [
    { pattern: /<video[^>]+\bsrc=["']([^"']+\.mp4[^"']*)["']/i, group: 1 },
    { pattern: /https?:\/\/[^"'\s<>]*turbocdn[^"'\s<>]*\.mp4[^"'\s<>]*/i, group: 0 },
  ];
  for (const { pattern, group } of strategies) {
    const found = html.match(pattern);
    if (found) return unescapeHtml(found[group]);
  }
  return null;
}

// Reduce an untrusted name to a bare file name: treat "\" like "/", keep only
// the last path segment, and reject results that are empty, "." or "..".
function safeLeafName(raw) {
  const leaf = basename(raw.replace(/\\/g, '/'));
  return leaf === '' || leaf === '.' || leaf === '..' ? null : leaf;
}

/**
 * Choose the local file name for a resolved mp4 URL.
 *
 * Preference order: the "fn" query parameter, then the last segment of the
 * URL path, then "<fallbackId>.mp4" ("turbo.mp4" when no id is given).
 *
 * The URL comes from a remote page and the result is joined onto the output
 * folder by the caller, so the name is always reduced to a single path
 * segment — an "fn" such as "../../x.mp4" must not escape that folder.
 *
 * @param {string} mp4Url - Resolved video URL.
 * @param {string} [fallbackId] - Video id used when the URL yields no name.
 * @returns {string} A file name without any directory component.
 */
function turboFilename(mp4Url, fallbackId) {
  try {
    const u = new URL(mp4Url);
    const fromQuery = safeLeafName(u.searchParams.get('fn') ?? '');
    if (fromQuery) return fromQuery;
    const fromPath = safeLeafName(u.pathname);
    if (fromPath) return fromPath;
  } catch {
    // Not a parseable URL — fall through to the id-based name.
  }
  return `${fallbackId || 'turbo'}.mp4`;
}

// Look up an existing file with the given dedup signature (exact byte size +
// md5 of its first HASH_BYTES bytes). Candidate hashes are computed on demand
// through ensureCandidateHash. Resolves to the matching index entry, or null.
async function findDuplicateCandidate(sizeIndex, size, headHash) {
  const candidates = sizeIndex.get(size);
  if (!candidates) return null;
  for (const cand of candidates) {
    const candHash = await ensureCandidateHash(cand);
    // An empty hash means the candidate could not be read; never match on it.
    if (candHash && candHash === headHash) return cand;
  }
  return null;
}

/**
 * Download a video into memory and write it to `dest`, skipping content that
 * already exists in the target folder.
 *
 * Dedup uses the same signature as the gallery scanner: exact size + md5 of
 * the first 64KB. Two checks are made:
 *   - early: once 64KB has arrived and the server sent a Content-Length, the
 *     head hash is compared against same-size files in `sizeIndex`; on a match
 *     the transfer is cancelled and nothing is written;
 *   - late: after the full body is read, the check is repeated with the real
 *     byte count (some servers omit Content-Length, and files under 64KB never
 *     reach the early check).
 *
 * The whole body is buffered in memory before being written in one call.
 * Bodies smaller than 10000 bytes are rejected as failed downloads.
 * On a successful write the new file is added to `sizeIndex`.
 *
 * This function never throws; failures are reported in the result object.
 *
 * @param {string} url - Direct mp4 URL.
 * @param {string} dest - Destination file path.
 * @param {string} ua - User-Agent header value.
 * @param {Map<number, Array<{filename: string, path: string, hash: (string|null)}>>} sizeIndex
 *   Size index of the destination folder (mutated on successful write).
 * @returns {Promise<object>} One of:
 *   { ok: true, size }                        — file written;
 *   { ok: true, dupe: true, dupeOf, size }    — identical content already present;
 *   { ok: false, status | reason | error }    — HTTP error, rejected body, or exception.
 */
async function downloadVideo(url, dest, ua, sizeIndex) {
  try {
    const r = await fetch(url, {
      headers: { 'User-Agent': ua, 'Referer': TURBO_BASE + '/' },
      signal: AbortSignal.timeout(600000),
    });
    if (!r.ok) {
      // An unread body keeps the underlying connection busy until it is
      // garbage-collected, so discard it explicitly (best effort).
      try { await r.body?.cancel(); } catch {}
      return { ok: false, status: r.status };
    }
    if (!r.body) return { ok: false, reason: 'empty response body' };

    const totalSize = parseInt(r.headers.get('content-length') || '0', 10);
    const reader = r.body.getReader();
    const chunks = [];
    let totalRead = 0;
    let firstHash = null;

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      chunks.push(value);
      totalRead += value.length;

      // Runs exactly once: the first time at least HASH_BYTES are buffered.
      if (firstHash === null && totalRead >= HASH_BYTES) {
        const head = Buffer.concat(chunks).subarray(0, HASH_BYTES);
        firstHash = createHash('md5').update(head).digest('hex');

        if (totalSize > 0) {
          const early = await findDuplicateCandidate(sizeIndex, totalSize, firstHash);
          if (early) {
            try { await reader.cancel(); } catch {}
            return { ok: true, dupe: true, dupeOf: early.filename, size: totalSize };
          }
        }
      }
    }

    const buf = Buffer.concat(chunks);
    if (buf.length < 10000) return { ok: false, reason: `too small ${buf.length}` };

    // Bodies shorter than HASH_BYTES never reached the in-loop hashing.
    if (firstHash === null && buf.length > 0) {
      firstHash = createHash('md5').update(buf.subarray(0, Math.min(buf.length, HASH_BYTES))).digest('hex');
    }

    // One more dedup pass with the actual final size (some servers omit Content-Length)
    if (firstHash) {
      const late = await findDuplicateCandidate(sizeIndex, buf.length, firstHash);
      if (late) return { ok: true, dupe: true, dupeOf: late.filename, size: buf.length };
    }

    writeFileSync(dest, buf);
    // Register the new file so later downloads in the same job dedup against it.
    if (!sizeIndex.has(buf.length)) sizeIndex.set(buf.length, []);
    sizeIndex.get(buf.length).push({ filename: basename(dest), path: dest, hash: firstHash });
    return { ok: true, size: buf.length };
  } catch (e) {
    return { ok: false, error: e.message };
  }
}

/**
 * Fetch an embed page through FlareSolverr and extract its mp4 URL.
 *
 * @param {string} sessionId - FlareSolverr session to fetch with.
 * @param {string} embedUrl - Embed page URL.
 * @returns {Promise<(string|null)>} The mp4 URL, or null when the page did not
 *   come back with status 200 or contains no recognisable video URL.
 */
async function resolveEmbed(sessionId, embedUrl) {
  const page = await fsGet(sessionId, embedUrl, '');
  const fetchedOk = page.status === 200;
  return fetchedOk ? extractMp4FromHtml(page.html) : null;
}

/**
 * Fetch an album page through FlareSolverr and collect the video ids on it.
 *
 * Ids come from two places, in this order, with duplicates removed:
 *   - data-id="<videoId>" attributes on the video tiles (6+ id characters);
 *   - direct turbo.<tld>/embed/<videoId> links, if present.
 *
 * @param {string} sessionId - FlareSolverr session to fetch with.
 * @param {string} albumUrl - Album page URL.
 * @returns {Promise<string[]>} Unique ids in first-seen order; [] when the page
 *   did not come back with status 200.
 */
async function resolveAlbumIds(sessionId, albumUrl) {
  const page = await fsGet(sessionId, albumUrl, '');
  if (page.status !== 200) return [];

  const seen = new Set();
  // Adds capture group 1 of every match of a /g pattern to `seen`.
  const harvest = (pattern) => {
    for (let hit = pattern.exec(page.html); hit !== null; hit = pattern.exec(page.html)) {
      seen.add(hit[1]);
    }
  };

  harvest(/data-id=["']([A-Za-z0-9_-]{6,})["']/g);
  harvest(/turbo\.[a-z]+\/embed\/([A-Za-z0-9_-]+)/g);

  return Array.from(seen);
}

/**
 * Resolve a turbo.cr URL (embed or album) and download all videos found.
 * Pass an existing FlareSolverr sessionId to reuse it across many calls;
 * otherwise one is created and destroyed per call.
 *
 * Videos are skipped (and not counted) when a file with the same name already
 * exists in outputDir or when the content duplicates an existing file.
 * Per-video failures are logged and do not stop the remaining downloads.
 * A failure while destroying the session this call created is logged rather
 * than thrown, so it cannot discard the result of the downloads.
 *
 * @param {string} url - Embed URL (".../embed/<id>") or album URL (".../a/...").
 * @param {string} outputDir - Folder the videos are written to; its base name
 *   is the folder key passed to upsertMediaFile.
 * @param {(message: string) => void} logFn - Receives progress/log lines.
 * @param {string} [userAgent] - User-Agent for the video download (defaults to DEFAULT_UA).
 * @param {string} [fsSession] - Existing FlareSolverr session id to reuse.
 * @returns {Promise<number>} The count of videos successfully downloaded.
 */
export async function downloadTurbo(url, outputDir, logFn, userAgent, fsSession) {
  const ua = userAgent || DEFAULT_UA;
  const folderName = basename(outputDir);
  let ownSession = false;
  let sessionId = fsSession;

  try {
    if (!sessionId) {
      sessionId = await fsCreateSession(TURBO_BASE + '/');
      ownSession = true;
    }

    let embedIds = [];
    const mEmbed = url.match(/turbo\.[a-z]+\/embed\/([A-Za-z0-9_-]+)/i);
    if (mEmbed) {
      embedIds = [mEmbed[1]];
    } else if (/\/a\//i.test(url)) {
      logFn(`turbo: resolving album ${url}`);
      embedIds = await resolveAlbumIds(sessionId, url);
      logFn(`turbo: album has ${embedIds.length} video(s)`);
    } else {
      logFn(`turbo: unrecognized URL ${url}`);
      return 0;
    }

    // Build size index once per call so repeated downloads in the same job
    // share the cache (and the lazy hash cache).
    const sizeIndex = buildSizeIndex(outputDir);

    let count = 0;
    for (const id of embedIds) {
      // Always resolve through the canonical host, whatever host the input used.
      const embedUrl = `${TURBO_BASE}/embed/${id}`;
      try {
        const mp4 = await resolveEmbed(sessionId, embedUrl);
        if (!mp4) {
          logFn(`turbo: could not resolve mp4 for ${id}`);
          continue;
        }
        const filename = turboFilename(mp4, id);
        const dest = join(outputDir, filename);
        if (existsSync(dest)) {
          logFn(`turbo: already have ${filename}`);
          continue;
        }
        const dl = await downloadVideo(mp4, dest, ua, sizeIndex);
        if (dl.ok && dl.dupe) {
          logFn(`turbo: skip ${filename} — same content as ${dl.dupeOf}`);
        } else if (dl.ok) {
          // The file is on disk at this point; a failed DB write must not
          // turn the download into a failure, but it should be visible.
          try {
            upsertMediaFile(folderName, filename, 'video', dl.size, Date.now(), null);
          } catch (e) {
            logFn(`turbo: could not record ${filename} in media db: ${e.message}`);
          }
          logFn(`Downloaded: ${filename} (${(dl.size / (1024 * 1024)).toFixed(1)} MB) [video]`);
          count++;
        } else {
          logFn(`turbo: download failed ${filename} - ${dl.status || dl.error || dl.reason}`);
        }
      } catch (e) {
        logFn(`turbo: error for ${id}: ${e.message}`);
      }
    }
    return count;
  } finally {
    // Only tear down a session this call created. A throw from inside
    // `finally` would replace the return value above, so failures are logged.
    if (ownSession && sessionId) {
      try {
        await fsDestroySession(sessionId);
      } catch (e) {
        logFn(`turbo: failed to destroy session: ${e.message}`);
      }
    }
  }
}