From a627388a4a7232de85db0f20fe7197c2ce01f093 Mon Sep 17 00:00:00 2001
From: Trey T
Date: Thu, 30 Apr 2026 10:33:13 -0500
Subject: [PATCH] turbo: dedup downloads by content hash, not just filename
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same video gets uploaded to turbo.cr under different IDs and resolves to
different filenames, so the existsSync(filename) check can't catch
content-duplicates. Switched to the same signature the gallery scanner
uses — md5 of the first 64KB plus exact byte-size match — and apply it
during the download stream so we abort once a same-content existing file
is detected. Avoids re-downloading content the user already has (or has
deliberately deleted via the duplicate scanner).

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 server/scrapers/turbo.js | 122 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 117 insertions(+), 5 deletions(-)

diff --git a/server/scrapers/turbo.js b/server/scrapers/turbo.js
index 5f330ad..738bfa2 100644
--- a/server/scrapers/turbo.js
+++ b/server/scrapers/turbo.js
@@ -1,8 +1,61 @@
-import { writeFileSync, existsSync } from 'fs';
+import { writeFileSync, existsSync, readdirSync, statSync, createReadStream, unlinkSync } from 'fs';
 import { join, basename } from 'path';
+import { createHash } from 'crypto';
 import { fsCreateSession, fsDestroySession, fsGet } from '../flaresolverr.js';
 import { upsertMediaFile } from '../db.js';
 
+// Match the duplicate scanner in gallery.js — md5 of first 64KB + exact size.
+const HASH_BYTES = 65536;
+
+// Resolve to the hex md5 of the first HASH_BYTES bytes of filePath (the whole
+// file when it is shorter). Asynchronous: the promise rejects on a read error.
+function hashFirst64k(filePath) {
+  return new Promise((resolve, reject) => {
+    const hash = createHash('md5');
+    const s = createReadStream(filePath, { start: 0, end: HASH_BYTES - 1 });
+    s.on('data', (c) => hash.update(c));
+    s.on('end', () => resolve(hash.digest('hex')));
+    s.on('error', reject);
+  });
+}
+
+// Build size -> [{filename, path, hash:null}] index for the folder. Hashes are
+// computed lazily only when a size collision is found.
+function buildSizeIndex(folderPath) {
+  const idx = new Map();
+  let entries;
+  try { entries = readdirSync(folderPath); } catch { return idx; }
+  for (const name of entries) {
+    if (name.startsWith('.')) continue;
+    const p = join(folderPath, name);
+    try {
+      const st = statSync(p);
+      if (!st.isFile()) continue;
+      if (!idx.has(st.size)) idx.set(st.size, []);
+      idx.get(st.size).push({ filename: name, path: p, hash: null });
+    } catch {} // statSync failed: leave this entry out of the index
+  }
+  return idx;
+}
+
+// Return the candidate's head hash, computing and caching it on first use. A
+// file that cannot be read caches '' so it is not retried and never matches.
+async function ensureCandidateHash(c) {
+  if (c.hash != null) return c.hash;
+  try { c.hash = await hashFirst64k(c.path); } catch { c.hash = ''; }
+  return c.hash;
+}
+
+// Return the first indexed entry of exactly `size` bytes whose head hash equals
+// `headHash`, or null when sizeIndex holds no such entry.
+async function findSameContent(sizeIndex, size, headHash) {
+  for (const cand of sizeIndex.get(size) ?? []) {
+    const ch = await ensureCandidateHash(cand);
+    if (ch && ch === headHash) return cand;
+  }
+  return null;
+}
+
 const TURBO_HOST_RE = /^https?:\/\/(?:www\.)?turbo\.\w+\//i;
 const TURBO_BASE = 'https://turbo.cr';
 const DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
@@ -36,16 +89,69 @@ function turboFilename(mp4Url, fallbackId) {
   return (fallbackId || 'turbo') + '.mp4';
 }
 
-async function downloadVideo(url, dest, ua) {
+// Stream-download a video. After the first 64KB has been received, compute its
+// md5 and check it against existing files of identical Content-Length in
+// sizeIndex. If a match is found, the stream is aborted and no file is written
+// — that's the same dedup signature the gallery scanner uses, so this prevents
+// re-downloading content the user has already kept (or deleted by hand).
+//
+// Resolves to { ok: true, size } after writing dest, to
+// { ok: true, dupe: true, dupeOf, size } when a same-content file is already
+// indexed, or to { ok: false, ... } carrying status, reason or error.
+async function downloadVideo(url, dest, ua, sizeIndex) {
   try {
     const r = await fetch(url, {
       headers: { 'User-Agent': ua, 'Referer': TURBO_BASE + '/' },
       signal: AbortSignal.timeout(600000), // 10 min for big videos
     });
     if (!r.ok) return { ok: false, status: r.status };
-    const buf = Buffer.from(await r.arrayBuffer());
+
+    const totalSize = parseInt(r.headers.get('content-length') || '0', 10);
+    const reader = r.body.getReader();
+    const chunks = [];
+    let totalRead = 0;
+    let firstHash = null;
+
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      chunks.push(value);
+      totalRead += value.length;
+
+      // Runs once, as soon as the head is complete: hash it and bail out early
+      // if an indexed file of the advertised size has the same head.
+      if (firstHash === null && totalRead >= HASH_BYTES) {
+        const head = Buffer.concat(chunks).subarray(0, HASH_BYTES);
+        firstHash = createHash('md5').update(head).digest('hex');
+
+        if (totalSize > 0) {
+          const match = await findSameContent(sizeIndex, totalSize, firstHash);
+          if (match) {
+            try { await reader.cancel(); } catch {}
+            return { ok: true, dupe: true, dupeOf: match.filename, size: totalSize };
+          }
+        }
+      }
+    }
+
+    const buf = Buffer.concat(chunks);
     if (buf.length < 10000) return { ok: false, reason: 'too small ' + buf.length };
+
+    // Body shorter than HASH_BYTES: the in-stream check never ran, so hash it all.
+    if (firstHash === null && buf.length > 0) {
+      firstHash = createHash('md5').update(buf.subarray(0, Math.min(buf.length, HASH_BYTES))).digest('hex');
+    }
+
+    // One more dedup pass with the actual final size (some servers omit Content-Length)
+    if (firstHash) {
+      const match = await findSameContent(sizeIndex, buf.length, firstHash);
+      if (match) return { ok: true, dupe: true, dupeOf: match.filename, size: buf.length };
+    }
+
     writeFileSync(dest, buf);
+    // Index the new file so later downloads sharing this sizeIndex see it too.
+    if (!sizeIndex.has(buf.length)) sizeIndex.set(buf.length, []);
+    sizeIndex.get(buf.length).push({ filename: basename(dest), path: dest, hash: firstHash });
     return { ok: true, size: buf.length };
   } catch (e) {
     return { ok: false, error: e.message };
@@ -104,6 +210,10 @@ export async function downloadTurbo(url, outputDir, logFn, userAgent, fsSession)
     return 0;
   }
 
+  // Build size index once per call so repeated downloads in the same job
+  // share the cache (and the lazy hash cache).
+  const sizeIndex = buildSizeIndex(outputDir);
+
   let count = 0;
   for (const id of embedIds) {
     const embedUrl = `${TURBO_BASE}/embed/${id}`;
@@ -119,8 +229,10 @@ export async function downloadTurbo(url, outputDir, logFn, userAgent, fsSession)
       logFn(`turbo: already have ${filename}`);
       continue;
     }
-    const dl = await downloadVideo(mp4, dest, ua);
-    if (dl.ok) {
+    const dl = await downloadVideo(mp4, dest, ua, sizeIndex);
+    if (dl.ok && dl.dupe) {
+      logFn(`turbo: skip ${filename} — same content as ${dl.dupeOf}`);
+    } else if (dl.ok) {
       try { upsertMediaFile(folderName, filename, 'video', dl.size, Date.now(), null); } catch {}
       logFn(`Downloaded: ${filename} (${(dl.size / (1024 * 1024)).toFixed(1)} MB) [video]`);
       count++;