turbo: dedup downloads by content hash, not just filename

The same video gets uploaded to turbo.cr under different IDs and resolves to
different filenames, so the existsSync(filename) check can't catch
content duplicates. Switch to the same signature the gallery scanner
uses — md5 of the first 64KB plus an exact byte-size match — and apply it
during the download stream so we abort as soon as an existing file with the
same content is detected. This avoids re-downloading content the user already
has (or has deliberately deleted via the duplicate scanner).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-04-30 10:33:13 -05:00
parent e6c0e2292b
commit a627388a4a
+102 -6
View File
@@ -1,8 +1,47 @@
import { writeFileSync, existsSync } from 'fs'; import { writeFileSync, existsSync, readdirSync, statSync, createReadStream, unlinkSync } from 'fs';
import { join, basename } from 'path'; import { join, basename } from 'path';
import { createHash } from 'crypto';
import { fsCreateSession, fsDestroySession, fsGet } from '../flaresolverr.js'; import { fsCreateSession, fsDestroySession, fsGet } from '../flaresolverr.js';
import { upsertMediaFile } from '../db.js'; import { upsertMediaFile } from '../db.js';
// Number of leading bytes that are hashed. Kept in step with the duplicate
// scanner in gallery.js, whose signature is md5 of first 64KB + exact size.
const HASH_BYTES = 65536;

/**
 * Compute the md5 hex digest of the first HASH_BYTES bytes of a file.
 *
 * Despite the "Sync" suffix this is asynchronous: the file is read through a
 * stream and the digest is delivered via the returned promise. Files shorter
 * than HASH_BYTES are hashed in full.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} Lower-case hex md5 of the file's leading bytes.
 *   Rejects with the stream's error if the file cannot be opened or read.
 */
async function hashFirst64kSync(filePath) {
  const digest = createHash('md5');
  // `end` is inclusive, so this range covers exactly HASH_BYTES bytes.
  const head = createReadStream(filePath, { start: 0, end: HASH_BYTES - 1 });
  for await (const piece of head) {
    digest.update(piece);
  }
  return digest.digest('hex');
}
/**
 * Index the regular files directly inside a folder by their byte size.
 *
 * Each map value is a list of `{ filename, path, hash }` records; `hash`
 * starts out as null and is meant to be filled in lazily, only once a size
 * collision makes a content comparison necessary.
 *
 * Entries whose name starts with "." are ignored, as are entries that are not
 * regular files or that cannot be stat'ed. A folder that cannot be listed
 * yields an empty index.
 *
 * @param {string} folderPath - Folder to scan (not recursive).
 * @returns {Map<number, Array<{filename: string, path: string, hash: (string|null)}>>}
 *   Size in bytes -> records for every file of that size.
 */
function buildSizeIndex(folderPath) {
  const bySize = new Map();

  let names;
  try {
    names = readdirSync(folderPath);
  } catch {
    // Folder is missing or unreadable: there is nothing to compare against.
    return bySize;
  }

  const visibleNames = names.filter((name) => !name.startsWith('.'));
  for (const filename of visibleNames) {
    const path = join(folderPath, filename);

    let info;
    try {
      info = statSync(path);
    } catch {
      // Best effort: an entry that cannot be stat'ed is simply left out.
      continue;
    }
    if (!info.isFile()) continue;

    const record = { filename, path, hash: null };
    const bucket = bySize.get(info.size);
    if (bucket === undefined) {
      bySize.set(info.size, [record]);
    } else {
      bucket.push(record);
    }
  }

  return bySize;
}
/**
 * Return the prefix hash of a size-index record, computing it on first use.
 *
 * The result is cached on the record itself (`c.hash`), so each file is read
 * at most once. If hashing fails, the empty string is cached instead; callers
 * can treat that falsy value as "no usable hash".
 *
 * @param {{path: string, hash: (string|null|undefined)}} c - Record to resolve;
 *   mutated in place when the hash has not been computed yet.
 * @returns {Promise<string>} The cached or freshly computed hash, or '' when
 *   the file could not be hashed.
 */
async function ensureCandidateHash(c) {
  if (c.hash === null || c.hash === undefined) {
    let digest = '';
    try {
      digest = await hashFirst64kSync(c.path);
    } catch {
      // Hashing failed: keep '' so the failure is cached and never matches.
    }
    c.hash = digest;
  }
  return c.hash;
}
const TURBO_HOST_RE = /^https?:\/\/(?:www\.)?turbo\.\w+\//i; const TURBO_HOST_RE = /^https?:\/\/(?:www\.)?turbo\.\w+\//i;
const TURBO_BASE = 'https://turbo.cr'; const TURBO_BASE = 'https://turbo.cr';
const DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'; const DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
@@ -36,16 +75,67 @@ function turboFilename(mp4Url, fallbackId) {
return (fallbackId || 'turbo') + '.mp4'; return (fallbackId || 'turbo') + '.mp4';
} }
async function downloadVideo(url, dest, ua) { // Stream-download a video. After the first 64KB has been received, compute its
// md5 and check it against existing files of identical Content-Length in
// sizeIndex. If a match is found, the stream is aborted and no file is written
// — that's the same dedup signature the gallery scanner uses, so this prevents
// re-downloading content the user has already kept (or deleted by hand).
async function downloadVideo(url, dest, ua, sizeIndex) {
try { try {
const r = await fetch(url, { const r = await fetch(url, {
headers: { 'User-Agent': ua, 'Referer': TURBO_BASE + '/' }, headers: { 'User-Agent': ua, 'Referer': TURBO_BASE + '/' },
signal: AbortSignal.timeout(600000), // 10 min for big videos signal: AbortSignal.timeout(600000),
}); });
if (!r.ok) return { ok: false, status: r.status }; if (!r.ok) return { ok: false, status: r.status };
const buf = Buffer.from(await r.arrayBuffer());
const totalSize = parseInt(r.headers.get('content-length') || '0', 10);
const reader = r.body.getReader();
const chunks = [];
let totalRead = 0;
let firstHash = null;
while (true) {
const { done, value } = await reader.read();
if (done) break;
chunks.push(value);
totalRead += value.length;
if (firstHash === null && totalRead >= HASH_BYTES) {
const head = Buffer.concat(chunks).subarray(0, HASH_BYTES);
firstHash = createHash('md5').update(head).digest('hex');
if (totalSize > 0 && sizeIndex.has(totalSize)) {
for (const cand of sizeIndex.get(totalSize)) {
const ch = await ensureCandidateHash(cand);
if (ch && ch === firstHash) {
try { await reader.cancel(); } catch {}
return { ok: true, dupe: true, dupeOf: cand.filename, size: totalSize };
}
}
}
}
}
const buf = Buffer.concat(chunks);
if (buf.length < 10000) return { ok: false, reason: 'too small ' + buf.length }; if (buf.length < 10000) return { ok: false, reason: 'too small ' + buf.length };
if (firstHash === null && buf.length > 0) {
firstHash = createHash('md5').update(buf.subarray(0, Math.min(buf.length, HASH_BYTES))).digest('hex');
}
// One more dedup pass with the actual final size (some servers omit Content-Length)
if (firstHash && sizeIndex.has(buf.length)) {
for (const cand of sizeIndex.get(buf.length)) {
const ch = await ensureCandidateHash(cand);
if (ch && ch === firstHash) {
return { ok: true, dupe: true, dupeOf: cand.filename, size: buf.length };
}
}
}
writeFileSync(dest, buf); writeFileSync(dest, buf);
if (!sizeIndex.has(buf.length)) sizeIndex.set(buf.length, []);
sizeIndex.get(buf.length).push({ filename: basename(dest), path: dest, hash: firstHash });
return { ok: true, size: buf.length }; return { ok: true, size: buf.length };
} catch (e) { } catch (e) {
return { ok: false, error: e.message }; return { ok: false, error: e.message };
@@ -104,6 +194,10 @@ export async function downloadTurbo(url, outputDir, logFn, userAgent, fsSession)
return 0; return 0;
} }
// Build size index once per call so repeated downloads in the same job
// share the cache (and the lazy hash cache).
const sizeIndex = buildSizeIndex(outputDir);
let count = 0; let count = 0;
for (const id of embedIds) { for (const id of embedIds) {
const embedUrl = `${TURBO_BASE}/embed/${id}`; const embedUrl = `${TURBO_BASE}/embed/${id}`;
@@ -119,8 +213,10 @@ export async function downloadTurbo(url, outputDir, logFn, userAgent, fsSession)
logFn(`turbo: already have ${filename}`); logFn(`turbo: already have ${filename}`);
continue; continue;
} }
const dl = await downloadVideo(mp4, dest, ua); const dl = await downloadVideo(mp4, dest, ua, sizeIndex);
if (dl.ok) { if (dl.ok && dl.dupe) {
logFn(`turbo: skip ${filename} — same content as ${dl.dupeOf}`);
} else if (dl.ok) {
try { upsertMediaFile(folderName, filename, 'video', dl.size, Date.now(), null); } catch {} try { upsertMediaFile(folderName, filename, 'video', dl.size, Date.now(), null); } catch {}
logFn(`Downloaded: ${filename} (${(dl.size / (1024 * 1024)).toFixed(1)} MB) [video]`); logFn(`Downloaded: ${filename} (${(dl.size / (1024 * 1024)).toFixed(1)} MB) [video]`);
count++; count++;