import * as cheerio from 'cheerio';
import { createWriteStream, existsSync, mkdirSync, statSync, writeFileSync } from 'fs';
import { basename, join, extname } from 'path';
import { pipeline } from 'stream/promises';
import { execFile } from 'child_process';
import { promisify } from 'util';
import { upsertMediaFile } from '../db.js';
import { fsGet } from '../flaresolverr.js';
import { isTurboUrl, downloadTurbo } from './turbo.js';

const execFileAsync = promisify(execFile);

// Default User-Agent sent when the caller does not supply one.
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// IP written into the DDoS-Guard `__ddg9_` cookie by fixCookieIp().
const SERVER_IP = '47.185.183.191';

/**
 * Thrown by scrapeForumPage() when a request that carried cookies comes back
 * with HTTP 403 or 404.
 */
export class CookieExpiredError extends Error {
  /**
   * @param {number} statusCode - HTTP status that triggered the error.
   */
  constructor(statusCode) {
    super(`Cookie expired or invalid (HTTP ${statusCode})`);
    this.name = 'CookieExpiredError';
    this.statusCode = statusCode;
  }
}

/**
 * Replace DDoS-Guard __ddg9_ cookie IP with server's IP so cookies work from
 * any browser. Only the first `__ddg9_` entry is rewritten.
 *
 * @param {string | null | undefined} cookies - Raw `Cookie` header value.
 * @returns {string | null | undefined} The rewritten header, or the input
 *   unchanged when it is falsy.
 */
export function fixCookieIp(cookies) {
  if (!cookies) return cookies;
  return cookies.replace(/__ddg9_=[^;]+/, `__ddg9_=${SERVER_IP}`);
}

export const FORUM_UA = UA;

const IMAGE_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']);
const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.webm', '.mkv', '.m4v', '.wmv', '.flv', '.ts']);

// Substrings that mark a URL as forum chrome rather than post content.
const SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star', 'dc_thumbnails'];

// External hosts that gallery-dl can resolve
const GALLERY_DL_HOSTS = [
  /saint\d*\.\w+/i,
  /cyberdrop\.\w+/i,
  /bunkr+\.\w+/i,
  /pixeldrain\.com/i,
  /gofile\.io/i,
  /turbo\.\w+/i,
];

/**
 * Test whether the lower-cased pathname of `url` ends with any of the given
 * extensions. Unparseable URLs yield `false`.
 *
 * @param {string} url - Absolute URL to inspect.
 * @param {Iterable<string>} extensions - Lower-case extensions including the dot.
 * @returns {boolean}
 */
function urlPathEndsWithAny(url, extensions) {
  try {
    const path = new URL(url).pathname.toLowerCase();
    for (const ext of extensions) {
      if (path.endsWith(ext)) return true;
    }
    return false;
  } catch {
    return false;
  }
}

/**
 * @param {string} url
 * @returns {boolean} True when the URL path ends with a known image extension.
 */
function isImageUrl(url) {
  return urlPathEndsWithAny(url, IMAGE_EXTS);
}

/**
 * @param {string} url
 * @returns {boolean} True when the URL path ends with a known video extension.
 */
function isVideoUrl(url) {
  return urlPathEndsWithAny(url, VIDEO_EXTS);
}

/**
 * @param {string} url
 * @returns {boolean} True when the URL is an image or a video URL.
 */
function isMediaUrl(url) {
  return isImageUrl(url) || isVideoUrl(url);
}

/**
 * @param {string} url
 * @returns {boolean} True when the URL's hostname matches one of
 *   GALLERY_DL_HOSTS. Unparseable URLs yield `false`.
 */
function isExternalHost(url) {
  try {
    const hostname = new URL(url).hostname.toLowerCase();
    return GALLERY_DL_HOSTS.some((pattern) => pattern.test(hostname));
  } catch {
    return false;
  }
}

/**
 * Build the URL of page `pageNum` by rewriting the first `page-N` segment of
 * `baseUrl` and dropping any `#fragment`.
 *
 * NOTE: when `baseUrl` contains no `page-N` segment nothing is rewritten and
 * the same URL (minus fragment) is returned for every page number.
 *
 * @param {string} baseUrl - Thread URL.
 * @param {number} pageNum - Page number to substitute.
 * @returns {string}
 */
export function getPageUrl(baseUrl, pageNum) {
  const url = baseUrl.replace(/page-\d+/, `page-${pageNum}`);
  return url.split('#')[0];
}

/**
 * Fetch a page either through a FlareSolverr session or with a plain fetch().
 *
 * With a session, `cookies` are handed to fsGet() as-is and only status 200
 * counts as success. Without one, the request carries the User-Agent and the
 * cookies rewritten by fixCookieIp(), with a 15 s timeout.
 *
 * @param {string} url - Page to fetch.
 * @param {string} [cookies] - Raw `Cookie` header value.
 * @param {string} [userAgent] - Overrides the default UA for plain fetches.
 * @param {*} [fsSession] - FlareSolverr session handle passed to fsGet().
 * @returns {Promise<{ ok: boolean, status: number, html: (string | undefined) }>}
 *   `html` is only populated when `ok` is true.
 */
async function fetchPageHtml(url, cookies, userAgent, fsSession) {
  if (fsSession) {
    const r = await fsGet(fsSession, url, cookies);
    const ok = r.status === 200;
    return { ok, status: r.status, html: ok ? r.html : undefined };
  }

  const headers = { 'User-Agent': userAgent || UA };
  if (cookies) headers['Cookie'] = fixCookieIp(cookies);
  const resp = await fetch(url, { headers, signal: AbortSignal.timeout(15000) });
  return {
    ok: resp.ok,
    status: resp.status,
    html: resp.ok ? await resp.text() : undefined,
  };
}

/**
 * Work out how many pages a thread has by inspecting its pagination links.
 *
 * @param {string} baseUrl - Thread URL to fetch.
 * @param {(msg: string) => void} logFn - Receives progress/error messages.
 * @param {string} [cookies] - Raw `Cookie` header value.
 * @param {string} [userAgent] - Overrides the default UA.
 * @param {*} [fsSession] - FlareSolverr session handle; when set the page is
 *   fetched through fsGet().
 * @returns {Promise<number | null>} The highest page number found when it is
 *   greater than 1; otherwise `null` (also on fetch failure or any error).
 */
export async function detectMaxPage(baseUrl, logFn, cookies, userAgent, fsSession) {
  try {
    const page = await fetchPageHtml(baseUrl, cookies, userAgent, fsSession);
    if (!page.ok) return null;
    const html = page.html;

    const $ = cheerio.load(html);
    let maxPage = 1;

    // Primary signal: page-N in the href of pagination links.
    $('a.pageNav-page, .pageNav a[href*="page-"], .pagination a[href*="page-"]').each((_, el) => {
      const href = $(el).attr('href') || '';
      const m = href.match(/page-(\d+)/);
      if (m) maxPage = Math.max(maxPage, parseInt(m[1], 10));
    });

    // Heuristic: any anchor whose text is purely numeric is treated as a page
    // link; values of 10000 and above are ignored.
    $('a').each((_, el) => {
      const text = $(el).text().trim();
      if (/^\d+$/.test(text)) {
        const n = parseInt(text, 10);
        if (n > maxPage && n < 10000) maxPage = n;
      }
    });

    // Final fallback: scan raw HTML for any page-N references (XenForo's
    // serialized pagination sometimes only appears in href attributes that
    // cheerio's class-based selectors miss).
    if (maxPage === 1) {
      const re = /page-(\d+)/g;
      let m;
      while ((m = re.exec(html)) !== null) {
        const n = parseInt(m[1], 10);
        if (n > maxPage && n < 10000) maxPage = n;
      }
    }

    if (maxPage > 1) {
      logFn(`Detected ${maxPage} pages`);
      return maxPage;
    }
    return null;
  } catch (err) {
    logFn(`Page detection failed: ${err.message}`);
    return null;
  }
}

/**
 * Derive candidate full-size URLs from a thumbnail URL using common naming
 * conventions (`.th.`, `.md.`, `_thumb.`, `/thumbs/`, `thumb_` prefix, query
 * string).
 *
 * @param {string} thumbUrl - Absolute thumbnail URL.
 * @returns {string[]} Candidates in the order the rules matched; empty when no
 *   rule applies.
 */
function tryFullSizeUrl(thumbUrl) {
  const candidates = [];

  if (thumbUrl.includes('.th.')) candidates.push(thumbUrl.replace('.th.', '.'));
  if (thumbUrl.includes('.md.')) candidates.push(thumbUrl.replace('.md.', '.'));
  if (/_thumb\./i.test(thumbUrl)) candidates.push(thumbUrl.replace(/_thumb\./i, '.'));
  if (thumbUrl.includes('/thumbs/')) {
    candidates.push(thumbUrl.replace('/thumbs/', '/images/'));
    candidates.push(thumbUrl.replace('/thumbs/', '/full/'));
  }

  try {
    const parsed = new URL(thumbUrl);
    const base = basename(parsed.pathname);
    if (base.startsWith('thumb_')) {
      // slice(6) drops the "thumb_" prefix.
      candidates.push(thumbUrl.replace(`/${base}`, `/${base.slice(6)}`));
    }
    if (parsed.search) candidates.push(thumbUrl.split('?')[0]);
  } catch {
    // Unparseable URL: keep whatever the string-based rules produced.
  }

  return candidates;
}

/**
 * Register a saved file via upsertMediaFile(), keyed by the output folder's
 * base name. Failures are logged and otherwise ignored so that an indexing
 * problem never fails a download that already succeeded.
 *
 * @param {string} outputDir - Directory the file was saved into.
 * @param {string} savedName - File name inside `outputDir`.
 * @param {'image' | 'video'} type - Media type.
 * @param {number} size - File size in bytes.
 * @param {(msg: string) => void} logFn - Receives the failure message, if any.
 */
function recordMediaFile(outputDir, savedName, type, size, logFn) {
  try {
    upsertMediaFile(basename(outputDir), savedName, type, size, Date.now(), null);
  } catch (err) {
    logFn(`Media index update failed for ${savedName}: ${err.message}`);
  }
}

/**
 * Download a single image into `outputDir`.
 *
 * The URL is added to `downloadedSet` before the request is made. It stays
 * there on HTTP/network failure, but is removed again when the response body
 * is smaller than 1000 bytes.
 *
 * @param {string} url - Absolute image URL.
 * @param {string} outputDir - Destination directory.
 * @param {Set<string>} downloadedSet - URLs already attempted; mutated.
 * @param {(msg: string) => void} logFn - Receives progress/error messages.
 * @param {string} [cookies] - Raw `Cookie` header value.
 * @param {string} [userAgent] - Overrides the default UA.
 * @returns {Promise<boolean>} True only when a new file was written.
 */
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, userAgent) {
  if (downloadedSet.has(url)) return false;
  if (!isImageUrl(url)) return false;

  const lower = url.toLowerCase();
  if (SKIP_PATTERNS.some((p) => lower.includes(p))) return false;

  downloadedSet.add(url);

  let filename;
  try {
    filename = basename(new URL(url).pathname);
  } catch {
    return false;
  }
  if (!filename) return false;

  // Save thumbnails under the name of their full-size counterpart.
  filename = filename.replace('.th.', '.').replace('.md.', '.');
  const filepath = join(outputDir, filename);

  if (existsSync(filepath)) {
    return false;
  }

  try {
    const dlHeaders = { 'User-Agent': userAgent || UA };
    if (cookies) dlHeaders['Cookie'] = fixCookieIp(cookies);

    const resp = await fetch(url, { headers: dlHeaders, signal: AbortSignal.timeout(30000) });
    if (!resp.ok) {
      logFn(`FAILED (${resp.status}): ${url}`);
      return false;
    }

    const buf = Buffer.from(await resp.arrayBuffer());
    if (buf.length < 1000) {
      // Body too small to keep; forget the URL again.
      downloadedSet.delete(url);
      return false;
    }

    writeFileSync(filepath, buf);
    const savedName = basename(filepath);
    recordMediaFile(outputDir, savedName, 'image', buf.length, logFn);

    const sizeKb = (buf.length / 1024).toFixed(1);
    logFn(`Downloaded: ${savedName} (${sizeKb} KB)`);
    return true;
  } catch (err) {
    logFn(`FAILED: ${basename(filepath)} - ${err.message}`);
    return false;
  }
}

/**
 * Use gallery-dl to download from external hosts (bunkr, saint, cyberdrop,
 * etc.). Turbo URLs are delegated to downloadTurbo() instead.
 *
 * @param {string} url - External link to resolve.
 * @param {string} outputDir - Destination directory.
 * @param {Set<string>} downloadedSet - URLs already attempted; mutated.
 * @param {(msg: string) => void} logFn - Receives progress/error messages.
 * @param {string} [userAgent] - Forwarded to downloadTurbo().
 * @param {*} [fsSession] - FlareSolverr session handle forwarded to
 *   downloadTurbo().
 * @returns {Promise<number>} Number of files detected as downloaded; 0 when
 *   the URL was already attempted or gallery-dl failed.
 */
async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn, userAgent, fsSession) {
  if (downloadedSet.has(url)) return 0;
  downloadedSet.add(url);

  // turbo.cr uses an obfuscated WASM player — gallery-dl can't extract the
  // signed mp4 URL. Resolve via FlareSolverr (renders JS) instead.
  if (isTurboUrl(url)) {
    return await downloadTurbo(url, outputDir, logFn, userAgent, fsSession);
  }

  logFn(`Resolving via gallery-dl: ${url}`);

  try {
    const args = [
      '-d', outputDir,
      '--filename', '{filename}.{extension}',
      '--no-mtime',
      '-o', 'directory=[]',
      url,
    ];
    const { stdout, stderr } = await execFileAsync('gallery-dl', args, {
      timeout: 300000, // 5 min per external link
      maxBuffer: 10 * 1024 * 1024,
    });

    let count = 0;
    const lines = `${stdout}\n${stderr}`.split('\n').filter(Boolean);

    for (const line of lines) {
      // gallery-dl outputs file paths for downloaded files
      const trimmed = line.trim();
      const looksLikePath = trimmed.startsWith(outputDir) || trimmed.startsWith('/');

      if (looksLikePath) {
        // Lines prefixed with "# " never satisfy the check above, so they are
        // not counted here.
        if (existsSync(trimmed)) {
          const stat = statSync(trimmed);
          const savedName = basename(trimmed);
          const ext = extname(savedName).toLowerCase();
          const type = VIDEO_EXTS.has(ext) ? 'video' : 'image';
          const sizeStr = type === 'video'
            ? `${(stat.size / (1024 * 1024)).toFixed(1)} MB`
            : `${(stat.size / 1024).toFixed(1)} KB`;

          recordMediaFile(outputDir, savedName, type, stat.size, logFn);
          logFn(`Downloaded: ${savedName} (${sizeStr}) [${type}]`);
          count++;
        }
      } else if (trimmed.includes('Downloading') || trimmed.includes('Skipping')) {
        logFn(` ${trimmed}`);
      }
    }

    if (count === 0) {
      // gallery-dl doesn't always output paths clearly, check stderr for errors
      const errLines = stderr ? stderr.split('\n').filter((l) => l.trim()) : [];
      for (const line of errLines) {
        if (line.includes('ERROR') || line.includes('error')) {
          logFn(` gallery-dl: ${line.trim()}`);
        }
      }
      logFn(` gallery-dl finished but no files detected from output`);
    }

    return count;
  } catch (err) {
    if (err.stderr) {
      // Prefer the first line that mentions an error; otherwise show the
      // first 200 characters of stderr.
      const errMsg = err.stderr.split('\n').find((l) => l.includes('ERROR') || l.includes('error'))
        || err.stderr.slice(0, 200);
      logFn(`gallery-dl error: ${errMsg.trim()}`);
    } else {
      logFn(`gallery-dl error: ${err.message}`);
    }
    return 0;
  }
}

/**
 * Walk the content areas of a parsed page and collect image URLs and external
 * links, in document order per area.
 *
 * @param {*} $ - Cheerio instance for the page.
 * @param {Array<*>} contentAreas - Elements to search within.
 * @param {string} pageUrl - Page URL, used to resolve relative links and to
 *   recognise same-host links.
 * @returns {{ imageUrls: string[], externalUrls: Set<string> }} `imageUrls`
 *   may contain duplicates; `externalUrls` holds direct video links and links
 *   to hosts matched by isExternalHost().
 */
function collectMediaUrls($, contentAreas, pageUrl) {
  const imageUrls = [];
  const externalUrls = new Set();
  const lazyAttrs = ['data-src', 'data-url', 'data-orig', 'data-original', 'data-full-url', 'data-zoom-src'];

  for (const area of contentAreas) {
    const $area = $(area);

    // Pass 1: <img> tags
    $area.find('img').each((_, el) => {
      const $img = $(el);
      const src = $img.attr('src') || $img.attr('data-src') || $img.attr('data-url') || '';
      if (!src) return;

      let absSrc;
      try {
        absSrc = new URL(src, pageUrl).href;
      } catch {
        return;
      }

      // An image wrapped in a link to an image: take the link target and
      // ignore the <img> itself.
      const $parentA = $img.closest('a');
      if ($parentA.length && $parentA.attr('href')) {
        try {
          const aHref = new URL($parentA.attr('href'), pageUrl).href;
          if (isImageUrl(aHref)) {
            imageUrls.push(aHref);
            return;
          }
        } catch {
          // Unparseable href: fall through and treat the <img> on its own.
        }
      }

      // When full-size candidates exist they replace the thumbnail URL.
      const fullCandidates = tryFullSizeUrl(absSrc);
      if (fullCandidates.length > 0) {
        imageUrls.push(...fullCandidates);
      } else {
        imageUrls.push(absSrc);
      }

      for (const attr of lazyAttrs) {
        const val = $img.attr(attr);
        if (val && val !== src) {
          try {
            imageUrls.push(new URL(val, pageUrl).href);
          } catch {
            // Ignore attribute values that are not valid URLs.
          }
        }
      }
    });

    // Pass 2: <a> links — images + external hosts
    $area.find('a[href]').each((_, el) => {
      const $a = $(el);
      let href;
      try {
        href = new URL($a.attr('href'), pageUrl).href;
      } catch {
        return;
      }

      // Skip same-forum links
      try {
        if (new URL(href).hostname === new URL(pageUrl).hostname) return;
      } catch {
        // pageUrl could not be parsed; treat the link as external.
      }

      // Direct image link (without child img — those are handled in Pass 1)
      if (isImageUrl(href) && $a.find('img').length === 0) {
        imageUrls.push(href);
        return;
      }

      // Direct video link
      if (isVideoUrl(href)) {
        externalUrls.add(href);
        return;
      }

      // External file host (bunkr, saint, cyberdrop, etc.)
      if (isExternalHost(href)) {
        externalUrls.add(href);
      }
    });

    // Pass 3: iframe embeds
    $area.find('iframe[src]').each((_, el) => {
      const src = $(el).attr('src');
      if (src) {
        try {
          const absUrl = new URL(src, pageUrl).href;
          if (isExternalHost(absUrl)) externalUrls.add(absUrl);
        } catch {
          // Ignore iframe sources that are not valid URLs.
        }
      }
    });
  }

  return { imageUrls, externalUrls };
}

/**
 * Fetch one forum page, extract media links from its post content and
 * download them into `outputDir`.
 *
 * @param {string} pageUrl - Page to scrape.
 * @param {string} outputDir - Destination directory.
 * @param {Set<string>} downloadedSet - URLs already attempted; mutated.
 * @param {(msg: string) => void} logFn - Receives progress/error messages.
 * @param {string} [cookies] - Raw `Cookie` header value.
 * @param {string} [userAgent] - Overrides the default UA.
 * @param {*} [fsSession] - FlareSolverr session handle; when set the page is
 *   fetched through fsGet().
 * @returns {Promise<number>} Number of files downloaded from this page; 0 when
 *   the page could not be fetched.
 * @throws {CookieExpiredError} When cookies were supplied and the page
 *   responded with HTTP 403 or 404.
 */
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies, userAgent, fsSession) {
  logFn(`Fetching page: ${pageUrl}${fsSession ? ' [via FlareSolverr]' : ''}`);

  let html;
  try {
    const page = await fetchPageHtml(pageUrl, cookies, userAgent, fsSession);
    if (!page.ok) {
      if (cookies && (page.status === 404 || page.status === 403)) {
        throw new CookieExpiredError(page.status);
      }
      logFn(`Failed to fetch page (${page.status})`);
      return 0;
    }
    html = page.html;
  } catch (err) {
    // Cookie expiry must reach the caller; everything else is logged.
    if (err instanceof CookieExpiredError) throw err;
    logFn(`Failed to fetch page: ${err.message}`);
    return 0;
  }

  const $ = cheerio.load(html);
  const selectors = '.message-body, .post-body, .post_body, .postcontent, .messageContent, .bbWrapper, article, .entry-content, .post_message, .post-content, #posts, .threadBody';
  let contentAreas = $(selectors).toArray();
  if (contentAreas.length === 0) {
    // No recognised post container: search the whole document.
    contentAreas = [$.root().get(0)];
  }

  const { imageUrls, externalUrls } = collectMediaUrls($, contentAreas, pageUrl);
  logFn(`Found ${imageUrls.length} images, ${externalUrls.size} external links`);

  let count = 0;

  // Download images
  for (const imgUrl of imageUrls) {
    if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies, userAgent)) {
      count++;
    }
  }

  // Download from external hosts (turbo.cr handled via FlareSolverr; rest via gallery-dl)
  for (const extUrl of externalUrls) {
    const dlCount = await downloadFromExternalHost(extUrl, outputDir, downloadedSet, logFn, userAgent, fsSession);
    count += dlCount;
  }

  logFn(`${count} files from this page`);
  return count;
}