4ba88d96f4
Mirror of a627388 but for the forum image path. The same image is often
re-uploaded under different filenames across pages/posts, so existsSync
on the target name can't catch content-duplicates. After fetching the
buffer, hash the first 64KB and compare against existing same-size files
in the target folder (same md5+size signature as gallery's duplicate
scanner). Confirmed against a known dani-speegle-2 pair:
skip IMG_79695f8914f20ce38b07.jpg — same content as
72759c89-7e53-4976-839a-7d952c444579.jpg
The size index returned by buildSizeIndex is built once per job in
runForumScrape and threaded through scrapeForumPage → downloadImage; hashes
are computed lazily and cached on its entries, so the cost amortizes across
all pages in the job.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
478 lines
16 KiB
JavaScript
478 lines
16 KiB
JavaScript
import * as cheerio from 'cheerio';
|
|
import { createReadStream, createWriteStream, existsSync, mkdirSync, readdirSync, statSync, writeFileSync } from 'fs';
|
|
import { basename, join, extname } from 'path';
|
|
import { pipeline } from 'stream/promises';
|
|
import { execFile } from 'child_process';
|
|
import { promisify } from 'util';
|
|
import { createHash } from 'crypto';
|
|
import { upsertMediaFile } from '../db.js';
|
|
import { fsGet } from '../flaresolverr.js';
|
|
import { isTurboUrl, downloadTurbo } from './turbo.js';
|
|
|
|
// Promise-returning form of child_process.execFile; used below to run gallery-dl.
const execFileAsync = promisify(execFile);
|
|
|
|
// Same signature as the duplicate scanner in gallery.js and turbo.js:
// md5 of the first 64KB of content, paired with the exact file size.
const HASH_BYTES = 65536;

/**
 * Compute the md5 hex digest of the leading HASH_BYTES bytes of a file.
 *
 * Despite the "Sync" suffix this is asynchronous: it returns a promise.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<string>} Hex digest; rejects if the file cannot be read.
 */
async function hashFirst64kSync(filePath) {
  const digest = createHash('md5');
  // `end` is inclusive, so at most HASH_BYTES bytes are streamed.
  const head = createReadStream(filePath, { start: 0, end: HASH_BYTES - 1 });
  for await (const chunk of head) {
    digest.update(chunk);
  }
  return digest.digest('hex');
}
|
|
|
|
/**
 * Index the regular files directly inside a folder by byte size.
 *
 * Each entry is `{ filename, path, hash: null }`; the hash stays null until a
 * size collision makes it worth computing.
 *
 * @param {string} folderPath - Folder to scan (not recursive).
 * @returns {Map<number, Array<{filename: string, path: string, hash: (string|null)}>>}
 *   Size -> entries. Empty when the folder cannot be listed.
 */
export function buildSizeIndex(folderPath) {
  const bySize = new Map();

  let names;
  try {
    names = readdirSync(folderPath);
  } catch {
    // Missing or unreadable folder: nothing to compare against.
    return bySize;
  }

  for (const name of names) {
    // Dotfiles are never indexed.
    if (name.startsWith('.')) continue;

    const fullPath = join(folderPath, name);
    let info;
    try {
      info = statSync(fullPath);
    } catch {
      // Entry could not be stat'ed — leave it out of the index.
      continue;
    }
    if (!info.isFile()) continue;

    const entry = { filename: name, path: fullPath, hash: null };
    const bucket = bySize.get(info.size);
    if (bucket) {
      bucket.push(entry);
    } else {
      bySize.set(info.size, [entry]);
    }
  }

  return bySize;
}
|
|
|
|
/**
 * Return a size-index entry's head hash, computing and caching it on first use.
 *
 * A failed read is cached as '' so the file is not re-read on every lookup;
 * callers treat '' as "no usable hash".
 *
 * @param {{path: string, hash: (string|null|undefined)}} c - Index entry (mutated).
 * @returns {Promise<string>} The cached or freshly computed hash.
 */
async function ensureCandidateHash(c) {
  if (c.hash == null) {
    let computed = '';
    try {
      computed = await hashFirst64kSync(c.path);
    } catch {
      // Unreadable file: fall through with the '' sentinel.
    }
    c.hash = computed;
  }
  return c.hash;
}
|
|
|
|
// Default browser User-Agent, sent whenever the caller does not pass its own.
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
|
// IP that fixCookieIp writes into the DDoS-Guard __ddg9_ cookie.
// NOTE(review): hard-coded; presumably this server's public address — confirm per deployment.
const SERVER_IP = '47.185.183.191';
|
|
|
|
/**
 * Signals that a cookie-authenticated request was refused, which this module
 * treats as an expired or invalid session cookie.
 *
 * `statusCode` carries the HTTP status that triggered it.
 */
export class CookieExpiredError extends Error {
  /**
   * @param {number} statusCode - HTTP status returned by the forum.
   */
  constructor(statusCode) {
    const detail = `Cookie expired or invalid (HTTP ${statusCode})`;
    super(detail);
    Object.assign(this, { name: 'CookieExpiredError', statusCode });
  }
}
|
|
|
|
/**
 * Rewrite the DDoS-Guard `__ddg9_` cookie so it carries the server's IP,
 * letting cookies copied from any browser work from this host.
 *
 * Only the first `__ddg9_=` pair is rewritten; everything else is untouched.
 *
 * @param {string|null|undefined} cookies - Raw Cookie header value.
 * @returns {string|null|undefined} Rewritten header, or the input itself when falsy.
 */
export function fixCookieIp(cookies) {
  const ddg9Pair = /__ddg9_=[^;]+/;
  return cookies ? cookies.replace(ddg9Pair, `__ddg9_=${SERVER_IP}`) : cookies;
}
|
|
|
|
// Exported alias of this module's default User-Agent string.
export const FORUM_UA = UA;
|
|
|
|
// Lower-case extensions (with leading dot) that classify a URL path as an image.
const IMAGE_EXTS = new Set([
  '.jpg',
  '.jpeg',
  '.png',
  '.gif',
  '.webp',
  '.bmp',
  '.tiff',
]);

// Lower-case extensions (with leading dot) that classify a URL path or file as a video.
const VIDEO_EXTS = new Set([
  '.mp4',
  '.mov',
  '.avi',
  '.webm',
  '.mkv',
  '.m4v',
  '.wmv',
  '.flv',
  '.ts',
]);

// Substrings matched against the lower-cased image URL; downloadImage skips
// any URL containing one of them.
const SKIP_PATTERNS = [
  'avatar',
  'smilie',
  'emoji',
  'icon',
  'logo',
  'button',
  'sprite',
  'badge',
  'rank',
  'star',
  'dc_thumbnails',
];
|
|
|
|
// Hostname patterns for external file hosts whose links are handed to
// downloadFromExternalHost. Patterns are unanchored and case-insensitive, and
// are tested against the URL's hostname only.
const GALLERY_DL_HOSTS = [
  /saint\d*\.\w+/i, // saint, saint2, ...
  /cyberdrop\.\w+/i,
  /bunkr+\.\w+/i, // "r+" also accepts doubled-r spellings such as bunkrr
  /pixeldrain\.com/i,
  /gofile\.io/i,
  /turbo\.\w+/i, // resolved by downloadTurbo rather than gallery-dl
];
|
|
|
|
/**
 * Whether the URL's path (query and fragment ignored) ends in a known image extension.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} False for unparsable URLs.
 */
function isImageUrl(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname.toLowerCase();
  } catch {
    // Not an absolute URL.
    return false;
  }
  for (const ext of IMAGE_EXTS) {
    if (pathname.endsWith(ext)) return true;
  }
  return false;
}
|
|
|
|
/**
 * Whether the URL's path (query and fragment ignored) ends in a known video extension.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} False for unparsable URLs.
 */
function isVideoUrl(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname.toLowerCase();
  } catch {
    // Not an absolute URL.
    return false;
  }
  for (const ext of VIDEO_EXTS) {
    if (pathname.endsWith(ext)) return true;
  }
  return false;
}
|
|
|
|
/**
 * Whether the URL points at an image or a video, judged by path extension.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean}
 */
function isMediaUrl(url) {
  if (isImageUrl(url)) return true;
  return isVideoUrl(url);
}
|
|
|
|
/**
 * Whether the URL's hostname matches one of the GALLERY_DL_HOSTS patterns.
 * Only the hostname is inspected — path and query never count.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} False for unparsable URLs.
 */
function isExternalHost(url) {
  let host;
  try {
    host = new URL(url).hostname.toLowerCase();
  } catch {
    // Not an absolute URL.
    return false;
  }
  for (const pattern of GALLERY_DL_HOSTS) {
    if (pattern.test(host)) return true;
  }
  return false;
}
|
|
|
|
/**
 * Derive the URL of page `pageNum` from a thread URL that already contains a
 * `page-N` component. The fragment is always dropped.
 *
 * A real pagination path segment ("/page-N" followed by "/", "?" or the end
 * of the URL) is preferred, and the last such segment wins. This keeps a
 * thread slug that merely contains "page-3" (e.g. "/threads/page-3-review.55/")
 * from being rewritten in place of the trailing "/page-N". If no such segment
 * exists, the first "page-N" anywhere is replaced, as before. URLs with no
 * "page-N" at all are returned unchanged (minus fragment).
 *
 * @param {string} baseUrl - Any URL of the thread.
 * @param {number} pageNum - Target page number.
 * @returns {string} URL for the requested page, without fragment.
 */
export function getPageUrl(baseUrl, pageNum) {
  // Strip the fragment first; a "page-N" inside it must never be the match.
  const [withoutFragment] = baseUrl.split('#');
  const pageToken = `page-${pageNum}`;

  let segment = null;
  for (const hit of withoutFragment.matchAll(/\/page-\d+(?=[/?]|$)/g)) {
    segment = hit;
  }
  if (segment) {
    const before = withoutFragment.slice(0, segment.index);
    const after = withoutFragment.slice(segment.index + segment[0].length);
    return `${before}/${pageToken}${after}`;
  }

  // Legacy behaviour for URL shapes without a clean "/page-N" segment.
  return withoutFragment.replace(/page-\d+/, pageToken);
}
|
|
|
|
/**
 * Fetch a thread page and work out how many pages the thread has.
 *
 * The page is loaded through FlareSolverr when `fsSession` is given, otherwise
 * with a plain `fetch` (15 s timeout). Any error is logged via `logFn` and
 * reported as null.
 *
 * @param {string} baseUrl - URL of a page of the thread.
 * @param {(msg: string) => void} logFn - Log sink.
 * @param {string} [cookies] - Raw Cookie header value.
 * @param {string} [userAgent] - Overrides the default User-Agent for `fetch`.
 * @param {*} [fsSession] - FlareSolverr session; truthy selects the fsGet path.
 * @returns {Promise<number|null>} Highest page number when greater than 1, else null.
 */
export async function detectMaxPage(baseUrl, logFn, cookies, userAgent, fsSession) {
  try {
    let html;
    if (fsSession) {
      const { status, html: body } = await fsGet(fsSession, baseUrl, cookies);
      if (status !== 200) return null;
      html = body;
    } else {
      const headers = {
        'User-Agent': userAgent || UA,
        ...(cookies ? { Cookie: fixCookieIp(cookies) } : {}),
      };
      const resp = await fetch(baseUrl, { headers, signal: AbortSignal.timeout(15000) });
      if (!resp.ok) return null;
      html = await resp.text();
    }
    const $ = cheerio.load(html);

    // Only pagination that belongs to THIS thread may count. XenForo
    // sidebars/widgets carry page-N links for unrelated threads, and bare
    // numeric anchors (online count, trending widgets) look like page numbers
    // but are not — so scope the search to the thread's own URL prefix.
    let threadPrefix = '';
    try {
      threadPrefix = new URL(baseUrl).pathname.match(/^(\/threads\/[^\/]+)/)?.[1] ?? '';
    } catch {
      // Unparsable baseUrl: fall through to the selector-based scan.
    }

    let maxPage = 1;
    if (threadPrefix) {
      // Escape regex metacharacters so the prefix is matched literally.
      const literalPrefix = threadPrefix.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      const ownPageRe = new RegExp(`${literalPrefix}/page-(\\d+)`, 'g');
      for (const [, digits] of String(html).matchAll(ownPageRe)) {
        const n = parseInt(digits, 10);
        // Values of 10000 and above are ignored as implausible.
        if (n > maxPage && n < 10000) maxPage = n;
      }
    } else {
      // No thread prefix (caller passed a non-thread URL): use the narrow
      // class-based selectors only, NOT a bare numeric-anchor scan.
      $('a.pageNav-page, .pageNav a[href*="page-"], .pagination a[href*="page-"]').each((_, el) => {
        const hit = ($(el).attr('href') || '').match(/page-(\d+)/);
        if (hit) maxPage = Math.max(maxPage, parseInt(hit[1], 10));
      });
    }

    if (maxPage <= 1) return null;
    logFn(`Detected ${maxPage} pages`);
    return maxPage;
  } catch (err) {
    logFn(`Page detection failed: ${err.message}`);
    return null;
  }
}
|
|
|
|
/**
 * Guess full-size image URLs from a thumbnail URL using common naming schemes.
 *
 * Candidates are returned in rule order and may repeat; each textual rule
 * rewrites only the first occurrence of its marker.
 *
 * @param {string} thumbUrl - Thumbnail URL (absolute for the URL-based rules).
 * @returns {string[]} Candidate URLs; empty when no rule applies.
 */
function tryFullSizeUrl(thumbUrl) {
  const guesses = [];

  // "name.th.jpg" / "name.md.jpg" -> "name.jpg"
  for (const marker of ['.th.', '.md.']) {
    if (thumbUrl.includes(marker)) guesses.push(thumbUrl.replace(marker, '.'));
  }

  // "name_thumb.jpg" -> "name.jpg" (case-insensitive)
  const thumbSuffix = /_thumb\./i;
  if (thumbSuffix.test(thumbUrl)) guesses.push(thumbUrl.replace(thumbSuffix, '.'));

  // ".../thumbs/name.jpg" -> ".../images/name.jpg", ".../full/name.jpg"
  if (thumbUrl.includes('/thumbs/')) {
    for (const dir of ['/images/', '/full/']) {
      guesses.push(thumbUrl.replace('/thumbs/', dir));
    }
  }

  // The remaining rules need a parsed URL; relative input just skips them.
  try {
    const { pathname, search } = new URL(thumbUrl);
    const leaf = basename(pathname);
    // ".../thumb_name.jpg" -> ".../name.jpg"
    if (leaf.startsWith('thumb_')) {
      guesses.push(thumbUrl.replace(`/${leaf}`, `/${leaf.slice(6)}`));
    }
    // Drop everything from the first "?" when there is a query string.
    if (search) guesses.push(thumbUrl.split('?')[0]);
  } catch {}

  return guesses;
}
|
|
|
|
/**
 * Download one image URL into `outputDir`, skipping anything already handled.
 *
 * Skips (returning false) when the URL was already attempted, is not an image
 * URL, matches a SKIP_PATTERNS substring, has no usable filename, the target
 * filename already exists, the response is not OK, the body is under 1000
 * bytes, or an existing same-size file has the same head hash.
 *
 * @param {string} url - Absolute image URL.
 * @param {string} outputDir - Destination folder.
 * @param {Set<string>} downloadedSet - URLs already attempted in this job (mutated).
 * @param {(msg: string) => void} logFn - Log sink.
 * @param {string} [cookies] - Raw Cookie header value.
 * @param {string} [userAgent] - Overrides the default User-Agent.
 * @param {Map<number, Array>} [sizeIndex] - Size index from buildSizeIndex (mutated).
 * @returns {Promise<boolean>} True only when a new file was written.
 */
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, userAgent, sizeIndex) {
  // Cheap rejections first.
  if (downloadedSet.has(url) || !isImageUrl(url)) return false;
  const lowered = url.toLowerCase();
  if (SKIP_PATTERNS.some((fragment) => lowered.includes(fragment))) return false;

  // Marked as attempted from here on; only the tiny-body case un-marks it.
  downloadedSet.add(url);

  let rawName;
  try {
    rawName = basename(new URL(url).pathname);
  } catch {
    return false;
  }
  if (!rawName) return false;

  // Thumbnail markers are dropped from the saved name.
  const filename = rawName.replace('.th.', '.').replace('.md.', '.');
  const filepath = join(outputDir, filename);
  if (existsSync(filepath)) return false;

  try {
    const requestHeaders = {
      'User-Agent': userAgent || UA,
      ...(cookies ? { Cookie: fixCookieIp(cookies) } : {}),
    };
    const resp = await fetch(url, { headers: requestHeaders, signal: AbortSignal.timeout(30000) });
    if (!resp.ok) {
      logFn(`FAILED (${resp.status}): ${url}`);
      return false;
    }

    const buf = Buffer.from(await resp.arrayBuffer());
    if (buf.length < 1000) {
      // Too small to be a real image; forget the URL so it can be tried again.
      downloadedSet.delete(url);
      return false;
    }

    // Content-hash dedup: the same image is often re-uploaded under different
    // names. The hash is only computed when an existing file has the same size.
    let newHash = null;
    if (sizeIndex && sizeIndex.has(buf.length)) {
      const headBytes = buf.subarray(0, Math.min(buf.length, HASH_BYTES));
      newHash = createHash('md5').update(headBytes).digest('hex');
      for (const candidate of sizeIndex.get(buf.length)) {
        const existingHash = await ensureCandidateHash(candidate);
        if (existingHash && existingHash === newHash) {
          logFn(`skip ${filename} — same content as ${candidate.filename}`);
          return false;
        }
      }
    }

    writeFileSync(filepath, buf);
    const savedName = basename(filepath);
    const folderName = basename(outputDir);
    // DB registration is best-effort; the file on disk is what matters.
    try { upsertMediaFile(folderName, savedName, 'image', buf.length, Date.now(), null); } catch {}

    // Register the new file so later downloads in this job dedupe against it.
    if (sizeIndex) {
      const entry = { filename: savedName, path: filepath, hash: newHash };
      const bucket = sizeIndex.get(buf.length);
      if (bucket) {
        bucket.push(entry);
      } else {
        sizeIndex.set(buf.length, [entry]);
      }
    }

    logFn(`Downloaded: ${savedName} (${(buf.length / 1024).toFixed(1)} KB)`);
    return true;
  } catch (err) {
    logFn(`FAILED: ${basename(filepath)} - ${err.message}`);
    return false;
  }
}
|
|
|
|
/**
 * Download everything behind an external-host link (bunkr, saint, cyberdrop,
 * etc.), using gallery-dl for all hosts except turbo.
 *
 * Files are counted by scanning gallery-dl's combined stdout/stderr for lines
 * that look like paths and exist on disk. gallery-dl failures are logged and
 * reported as 0.
 *
 * @param {string} url - External link.
 * @param {string} outputDir - Destination folder.
 * @param {Set<string>} downloadedSet - URLs already attempted in this job (mutated).
 * @param {(msg: string) => void} logFn - Log sink.
 * @param {string} [userAgent] - Forwarded to downloadTurbo.
 * @param {*} [fsSession] - FlareSolverr session, forwarded to downloadTurbo.
 * @returns {Promise<number>} Number of files detected as downloaded.
 */
async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn, userAgent, fsSession) {
  if (downloadedSet.has(url)) return 0;
  downloadedSet.add(url);

  // turbo.cr uses an obfuscated WASM player — gallery-dl can't extract the
  // signed mp4 URL. Resolve via FlareSolverr (renders JS) instead.
  if (isTurboUrl(url)) {
    return await downloadTurbo(url, outputDir, logFn, userAgent, fsSession);
  }

  logFn(`Resolving via gallery-dl: ${url}`);

  try {
    const args = ['-d', outputDir, '--filename', '{filename}.{extension}', '--no-mtime', '-o', 'directory=[]', url];
    const { stdout, stderr } = await execFileAsync('gallery-dl', args, {
      timeout: 300000, // 5 min per external link
      maxBuffer: 10 * 1024 * 1024,
    });

    let count = 0;
    for (const rawLine of `${stdout}\n${stderr}`.split('\n').filter(Boolean)) {
      const line = rawLine.trim();

      // gallery-dl outputs file paths for downloaded files.
      const looksLikePath = line.startsWith(outputDir) || line.startsWith('/');
      if (!looksLikePath) {
        if (line.includes('Downloading') || line.includes('Skipping')) {
          logFn(` ${line}`);
        }
        continue;
      }

      // NOTE(review): a line that begins with "# " normally fails the path
      // test above, so this strip rarely (if ever) changes anything.
      const filePath = line.replace(/^# /, '');
      if (!existsSync(filePath)) continue;

      const { size } = statSync(filePath);
      const savedName = basename(filePath);
      const folderName = basename(outputDir);
      const type = VIDEO_EXTS.has(extname(savedName).toLowerCase()) ? 'video' : 'image';
      const sizeStr = type === 'video'
        ? `${(size / (1024 * 1024)).toFixed(1)} MB`
        : `${(size / 1024).toFixed(1)} KB`;

      // DB registration is best-effort.
      try { upsertMediaFile(folderName, savedName, type, size, Date.now(), null); } catch {}
      logFn(`Downloaded: ${savedName} (${sizeStr}) [${type}]`);
      count++;
    }

    if (count === 0) {
      // gallery-dl doesn't always output paths clearly, check stderr for errors
      const errLines = stderr ? stderr.split('\n').filter((l) => l.trim()) : [];
      for (const errLine of errLines) {
        if (errLine.includes('ERROR') || errLine.includes('error')) {
          logFn(` gallery-dl: ${errLine.trim()}`);
        }
      }
      logFn(` gallery-dl finished but no files detected from output`);
    }

    return count;
  } catch (err) {
    // Prefer the first error-looking stderr line; fall back to a stderr
    // excerpt, then to the exception message.
    const detail = err.stderr
      ? (err.stderr.split('\n').find((l) => l.includes('ERROR') || l.includes('error')) || err.stderr.slice(0, 200)).trim()
      : err.message;
    logFn(`gallery-dl error: ${detail}`);
    return 0;
  }
}
|
|
|
|
/**
 * Scrape one forum page: collect image URLs and external-host links from the
 * post content, then download them one after another.
 *
 * @param {string} pageUrl - Page to scrape; also the base for relative URLs.
 * @param {string} outputDir - Destination folder.
 * @param {Set<string>} downloadedSet - URLs already attempted in this job (mutated).
 * @param {(msg: string) => void} logFn - Log sink.
 * @param {string} [cookies] - Raw Cookie header value.
 * @param {string} [userAgent] - Overrides the default User-Agent.
 * @param {*} [fsSession] - FlareSolverr session; truthy fetches the page via fsGet.
 * @param {Map<number, Array>} [sizeIndex] - Size index passed on to downloadImage.
 * @returns {Promise<number>} Files downloaded from this page (0 if the page could not be fetched).
 * @throws {CookieExpiredError} When cookies were supplied and the page answers 403 or 404.
 */
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies, userAgent, fsSession, sizeIndex) {
  logFn(`Fetching page: ${pageUrl}${fsSession ? ' [via FlareSolverr]' : ''}`);

  let html;
  try {
    // Both transports funnel a failed response into `failure`, handled once below.
    let failure = null;
    if (fsSession) {
      const r = await fsGet(fsSession, pageUrl, cookies);
      if (r.status !== 200) {
        failure = { status: r.status };
      } else {
        html = r.html;
      }
    } else {
      const headers = {
        'User-Agent': userAgent || UA,
        ...(cookies ? { Cookie: fixCookieIp(cookies) } : {}),
      };
      const resp = await fetch(pageUrl, { headers, signal: AbortSignal.timeout(15000) });
      if (!resp.ok) {
        failure = { status: resp.status };
      } else {
        html = await resp.text();
      }
    }

    if (failure) {
      // With cookies in play, 403/404 is treated as an expired/invalid cookie.
      if (cookies && (failure.status === 404 || failure.status === 403)) {
        throw new CookieExpiredError(failure.status);
      }
      logFn(`Failed to fetch page (${failure.status})`);
      return 0;
    }
  } catch (err) {
    if (err instanceof CookieExpiredError) throw err;
    logFn(`Failed to fetch page: ${err.message}`);
    return 0;
  }

  const $ = cheerio.load(html);

  const selectors = '.message-body, .post-body, .post_body, .postcontent, .messageContent, .bbWrapper, article, .entry-content, .post_message, .post-content, #posts, .threadBody';
  const matchedAreas = $(selectors).toArray();
  // No recognisable post container: scan the whole document instead.
  const contentAreas = matchedAreas.length > 0 ? matchedAreas : [$.root().get(0)];

  const imageUrls = [];
  const externalUrls = new Set();

  // Hostname of the page itself, used to skip same-forum links.
  let pageHost = null;
  try { pageHost = new URL(pageUrl).hostname; } catch {}

  const lazyAttrs = ['data-src', 'data-url', 'data-orig', 'data-original', 'data-full-url', 'data-zoom-src'];

  for (const area of contentAreas) {
    const $area = $(area);

    // Pass 1: <img> tags
    for (const imgEl of $area.find('img').toArray()) {
      const $img = $(imgEl);
      const src = $img.attr('src') || $img.attr('data-src') || $img.attr('data-url') || '';
      if (!src) continue;

      let absSrc;
      try { absSrc = new URL(src, pageUrl).href; } catch { continue; }

      // An image wrapped in a link to an image: take the link target and stop.
      const $link = $img.closest('a');
      const linkHref = $link.length ? $link.attr('href') : undefined;
      if (linkHref) {
        let target = null;
        try { target = new URL(linkHref, pageUrl).href; } catch {}
        if (target !== null && isImageUrl(target)) {
          imageUrls.push(target);
          continue;
        }
      }

      // Otherwise prefer guessed full-size URLs over the thumbnail itself.
      const fullCandidates = tryFullSizeUrl(absSrc);
      imageUrls.push(...(fullCandidates.length > 0 ? fullCandidates : [absSrc]));

      // Lazy-load / zoom attributes may carry another URL for the same image.
      for (const attr of lazyAttrs) {
        const val = $img.attr(attr);
        if (!val || val === src) continue;
        try { imageUrls.push(new URL(val, pageUrl).href); } catch {}
      }
    }

    // Pass 2: <a href> links — images + external hosts
    for (const linkEl of $area.find('a[href]').toArray()) {
      const $a = $(linkEl);
      let href;
      try { href = new URL($a.attr('href'), pageUrl).href; } catch { continue; }

      // Skip same-forum links
      if (pageHost !== null && new URL(href).hostname === pageHost) continue;

      // Direct image link (without child img — those are handled in Pass 1)
      if (isImageUrl(href) && $a.find('img').length === 0) {
        imageUrls.push(href);
        continue;
      }

      // Direct video link, or external file host (bunkr, saint, cyberdrop, etc.)
      if (isVideoUrl(href) || isExternalHost(href)) {
        externalUrls.add(href);
      }
    }

    // Pass 3: iframe embeds
    for (const frameEl of $area.find('iframe[src]').toArray()) {
      const src = $(frameEl).attr('src');
      if (!src) continue;
      try {
        const absUrl = new URL(src, pageUrl).href;
        if (isExternalHost(absUrl)) externalUrls.add(absUrl);
      } catch {}
    }
  }

  logFn(`Found ${imageUrls.length} images, ${externalUrls.size} external links`);

  let count = 0;

  // Download images
  for (const imgUrl of imageUrls) {
    const saved = await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies, userAgent, sizeIndex);
    if (saved) count++;
  }

  // Download from external hosts (turbo.cr handled via FlareSolverr; rest via gallery-dl)
  for (const extUrl of externalUrls) {
    count += await downloadFromExternalHost(extUrl, outputDir, downloadedSet, logFn, userAgent, fsSession);
  }

  logFn(`${count} files from this page`);
  return count;
}
|