import { Router } from 'express';
import { mkdirSync } from 'fs';
import { isAbsolute, join, relative, resolve, sep } from 'path';
import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError } from './scrapers/forum.js';
import { refreshForumCookies } from './flaresolverr.js';
import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js';
import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js';
import { parseMegaUrl, listAllFiles, downloadMegaFiles } from './scrapers/mega.js';
import { runYtdlp } from './scrapers/ytdlp.js';
import { parseLeakGalleryUrl, fetchAllMedia as fetchLeakGalleryMedia, downloadMedia as downloadLeakGalleryMedia } from './scrapers/leakgallery.js';
import { getAutoScrapeJobs, addAutoScrapeJob, removeAutoScrapeJob, getForumSites, getForumSiteById, createForumSite, updateForumSite, deleteForumSite } from './db.js';

const router = Router();

// Root directory that every scrape writes under.
const MEDIA_PATH = process.env.MEDIA_PATH || './data/media';

// In-memory job registry, keyed by job id.
const jobsMap = new Map();
let jobCounter = 0;

// Retention limits: finished jobs kept in the registry, and log lines kept per job.
const MAX_COMPLETED = 50;
const MAX_LOGS = 200;

/**
 * Create a job record in the running state and register it in `jobsMap`.
 *
 * @param {string} type - Job kind label (e.g. 'forum', 'coomer').
 * @param {object} config - Scraper configuration; `config.folderName` is
 *   copied onto the job (defaulting to 'scrape').
 * @returns {object} The newly registered job.
 */
function createJob(type, config) {
  const id = `scrape_${Date.now()}_${++jobCounter}`;
  const job = {
    id,
    type,
    config,
    progress: { total: 0, completed: 0, errors: 0 },
    running: true,
    cancelled: false,
    logs: [],
    startedAt: new Date().toISOString(),
    completedAt: null,
    folderName: config.folderName || 'scrape',
  };
  jobsMap.set(id, job);
  return job;
}

/**
 * Append a timestamped line to a job's log, dropping the oldest line once
 * the log exceeds MAX_LOGS entries.
 */
function addLog(job, msg) {
  const ts = new Date().toLocaleTimeString('en-US', { hour12: false });
  job.logs.push(`[${ts}] ${msg}`);
  if (job.logs.length > MAX_LOGS) job.logs.shift();
}

/** Drop the oldest finished jobs so at most MAX_COMPLETED of them remain. */
function pruneCompleted() {
  const completed = [...jobsMap.values()]
    .filter((j) => !j.running)
    .sort((a, b) => new Date(b.completedAt) - new Date(a.completedAt));
  if (completed.length > MAX_COMPLETED) {
    for (const old of completed.slice(MAX_COMPLETED)) {
      jobsMap.delete(old.id);
    }
  }
}

/** Serialize a job for API responses (log lines are replaced by a count). */
function jobToJson(job) {
  return {
    id: job.id,
    type: job.type,
    config: job.config,
    progress: job.progress,
    running: job.running,
    cancelled: job.cancelled,
    paused: job.paused || false,
    resumeAt: job.resumeAt || null,
    folderName: job.folderName,
    startedAt: job.startedAt,
    completedAt: job.completedAt,
    logCount: job.logs.length,
  };
}

// --- Shared helpers ---

/**
 * Map a folder name to its output directory under MEDIA_PATH.
 *
 * The folder name comes from request bodies, so the joined path is checked
 * to make sure it does not climb out of the media root (e.g. '../../etc').
 * Nested folders such as 'site/user' remain allowed.
 *
 * @param {string} folderName
 * @returns {string} `join(MEDIA_PATH, folderName)`.
 * @throws {Error} When the resulting path lies outside MEDIA_PATH.
 * @throws {TypeError} When `folderName` is not a string (raised by `join`).
 */
function resolveOutputDir(folderName) {
  const outputDir = join(MEDIA_PATH, folderName);
  const rel = relative(resolve(MEDIA_PATH), resolve(outputDir));
  // `isAbsolute` covers the Windows case where the target is on another drive.
  if (rel === '..' || rel.startsWith(`..${sep}`) || isAbsolute(rel)) {
    throw new Error(`Folder name must stay inside the media directory: ${folderName}`);
  }
  return outputDir;
}

/** Resolve the output directory for a folder name and create it if missing. */
function prepareOutputDir(folderName) {
  const outputDir = resolveOutputDir(folderName);
  mkdirSync(outputDir, { recursive: true });
  return outputDir;
}

/**
 * Validate a folder name received from a request.
 *
 * @returns {string|null} An error message for a 400 response, or null if valid.
 */
function folderNameError(folderName) {
  if (!folderName) return 'Folder name is required';
  try {
    resolveOutputDir(folderName);
    return null;
  } catch (err) {
    return err.message;
  }
}

/** Parse a base-10 integer, falling back when the result is NaN or 0. */
function intOr(value, fallback) {
  return Number.parseInt(value, 10) || fallback;
}

/** Parse a base-10 integer (with fallback) and clamp it into [min, max]. */
function clampInt(value, fallback, min, max) {
  return Math.min(Math.max(intOr(value, fallback), min), max);
}

/** Mark a job as finished and trim the registry of old finished jobs. */
function finishJob(job) {
  job.running = false;
  job.completedAt = new Date().toISOString();
  pruneCompleted();
}

/** Build the (completed, errors, total) callback that mirrors downloader progress onto the job. */
function trackProgress(job) {
  return (completed, errors, total) => {
    job.progress.completed = completed;
    job.progress.errors = errors;
    job.progress.total = total;
  };
}

/**
 * Start a runner in the background. The runners handle their own errors, so
 * this catch is a last-resort guard that still marks the job as stopped.
 */
function launchJob(job, runner) {
  runner(job).catch((err) => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
  });
}

// --- Forum Scrape ---

/**
 * Scrape a range of forum pages into MEDIA_PATH/<folderName>.
 *
 * Reads url, startPage, endPage, delay (seconds), folderName, siteId,
 * lastPageOnly and cookies from `job.config`. When a page fails with
 * CookieExpiredError and a siteId is configured, cookies are refreshed once
 * via `refreshForumCookies` and the page is retried; otherwise the scrape stops.
 * Never rejects for scrape errors: they are logged and counted on the job.
 */
async function runForumScrape(job) {
  const { url, delay, folderName, siteId, lastPageOnly } = job.config;
  let { startPage, endPage, cookies } = job.config;
  const log = (msg) => addLog(job, msg);
  let totalImages = 0;

  try {
    // Load cookies from forum site record if siteId provided and no cookies passed
    if (!cookies && siteId) {
      const site = getForumSiteById(siteId);
      if (site && site.cookies) {
        cookies = site.cookies;
        job.config.cookies = cookies;
        log(`Loaded cookies from forum site: ${site.name}`);
      }
    }

    const outputDir = prepareOutputDir(folderName);
    const downloadedSet = new Set();

    // If lastPageOnly, detect the last page and only scrape that
    if (lastPageOnly) {
      log('Detecting last page...');
      const maxPage = await detectMaxPage(url, log, cookies);
      if (maxPage) {
        startPage = maxPage;
        endPage = maxPage;
        log(`Last page detected: ${maxPage}`);
      } else {
        log('Could not detect last page — falling back to page range');
      }
    }

    log(`Starting forum scrape: pages ${startPage}-${endPage}`);
    log(`Output: ${outputDir}`);
    // An inverted range scrapes nothing; report 0 pages rather than a negative total.
    job.progress.total = Math.max(0, endPage - startPage + 1);

    for (let page = startPage; page <= endPage; page++) {
      if (job.cancelled) {
        log('Cancelled by user');
        break;
      }

      const pageUrl = getPageUrl(url, page);
      log(`--- Page ${page}/${endPage} ---`);

      let count;
      try {
        count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, log, cookies);
      } catch (err) {
        // Anything other than an expired cookie is handled by the outer catch.
        if (!(err instanceof CookieExpiredError)) throw err;

        if (!siteId) {
          log(`Cookie expired (HTTP ${err.statusCode}) — no siteId configured for auto-refresh`);
          log('Stopping scrape — refresh cookies manually and try again');
          break;
        }

        log(`Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`);
        try {
          cookies = await refreshForumCookies(siteId);
          job.config.cookies = cookies;
          log('Cookies refreshed successfully — retrying page...');
          count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, log, cookies);
        } catch (refreshErr) {
          log(`Cookie refresh failed: ${refreshErr.message}`);
          log('Stopping scrape — fix credentials or refresh cookies manually');
          break;
        }
      }

      totalImages += count;
      job.progress.completed = page - startPage + 1;

      // Pause between pages, but not after the last one or once cancelled.
      if (page < endPage && !job.cancelled) {
        await new Promise((r) => setTimeout(r, delay * 1000));
      }
    }
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    log(`Done! ${totalImages} files saved to ${folderName}/`);
    finishJob(job);
  }
}

// --- Coomer Scrape ---

/**
 * Collect file links for a coomer user or search URL, then download them
 * into MEDIA_PATH/<folderName>. Reads url, pages, workers and folderName
 * from `job.config`. Errors are logged and counted on the job.
 */
async function runCoomerScrape(job) {
  const { url, pages, workers, folderName } = job.config;
  const log = (msg) => addLog(job, msg);
  const isCancelled = () => job.cancelled;

  try {
    const outputDir = prepareOutputDir(folderName);

    log(`Starting coomer scrape: ${url}`);
    log(`Pages: ${pages}, Workers: ${workers}`);

    // Phase 1: Collect file list
    const parsed = parseUserUrl(url);
    let files;
    if (parsed.mode === 'search') {
      log(`Site: ${parsed.base}, Search: "${parsed.query}"`);
      log(`Fetching up to ${pages} pages...`);
      files = await fetchSearchPosts(parsed.base, parsed.query, pages, log, isCancelled);
    } else {
      log(`Site: ${parsed.base}, Service: ${parsed.service}, User: ${parsed.userId}`);
      log(`Fetching up to ${pages} pages...`);
      files = await fetchAllPosts(parsed.base, parsed.service, parsed.userId, pages, log, isCancelled);
    }

    if (job.cancelled) {
      log('Cancelled by user');
      return;
    }
    if (files.length === 0) {
      log('No files found');
      return;
    }

    job.progress.total = files.length;
    log(`Found ${files.length} files. Starting downloads...`);

    // Phase 2: Download
    const result = await downloadFiles(files, outputDir, workers, log, trackProgress(job), isCancelled);
    log(`Done! ${result.completed} downloaded, ${result.errors} failed, ${result.skipped} skipped`);
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    finishJob(job);
  }
}

// --- MediaLink Scrape ---

/**
 * Collect media items for a medialink URL (HTML or API mode, as reported by
 * `parseMediaUrl`), then download them into MEDIA_PATH/<folderName>. Reads
 * url, pages, workers, delay (ms) and folderName from `job.config`.
 */
async function runMediaLinkScrape(job) {
  const { url, pages, workers, delay, folderName } = job.config;
  const log = (msg) => addLog(job, msg);
  const isCancelled = () => job.cancelled;

  try {
    const outputDir = prepareOutputDir(folderName);

    log(`Starting medialink scrape: ${url}`);
    log(`Pages: ${pages}, Workers: ${workers}, Delay: ${delay}ms`);

    const { base, userId, mode } = parseMediaUrl(url);
    log(`Site: ${base}, ${mode === 'html' ? 'Slug' : 'User ID'}: ${userId} (${mode} mode)`);

    // Phase 1: Collect all media
    let items;
    if (mode === 'html') {
      log(`Fetching up to ${pages} pages via HTML scraping...`);
      items = await fetchAllMediaFromHtml(base, userId, pages, delay, log, isCancelled);
    } else {
      log(`Fetching up to ${pages} pages from API...`);
      items = await fetchAllMedia(base, userId, pages, delay, log, isCancelled);
    }

    if (job.cancelled) {
      log('Cancelled by user');
      return;
    }
    if (items.length === 0) {
      log('No media found');
      return;
    }

    job.progress.total = items.length;
    log(`Found ${items.length} media items. Downloading...`);

    // Phase 2: Download all media files
    const result = await downloadMedia(
      items,
      outputDir,
      workers,
      log,
      trackProgress(job),
      isCancelled,
      base + '/'
    );
    log(`Done! ${result.completed} downloaded, ${result.errors} failed, ${result.skipped} skipped`);
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    finishJob(job);
  }
}

// --- Mega Scrape ---

/**
 * List the files behind a mega.nz URL and download them into
 * MEDIA_PATH/<folderName>. Reads url, workers and folderName from
 * `job.config`. The downloader's status callback is mirrored onto
 * `job.paused` / `job.resumeAt`.
 */
async function runMegaScrape(job) {
  const { url, workers, folderName } = job.config;
  const log = (msg) => addLog(job, msg);

  try {
    const outputDir = prepareOutputDir(folderName);

    log(`Starting mega.nz scrape: ${url}`);
    log(`Workers: ${workers}`);

    // Called for validation only; the return value is not used here.
    parseMegaUrl(url);

    // Phase 1: List all files
    const { items } = await listAllFiles(url, log);

    if (job.cancelled) {
      log('Cancelled by user');
      return;
    }
    if (items.length === 0) {
      log('No files found in folder');
      return;
    }

    job.progress.total = items.length;
    const totalSizeMb = (items.reduce((s, i) => s + i.size, 0) / (1024 * 1024)).toFixed(0);
    log(`Found ${items.length} files (${totalSizeMb} MB). Downloading...`);

    // Phase 2: Download
    const result = await downloadMegaFiles(
      items,
      outputDir,
      workers,
      log,
      trackProgress(job),
      () => job.cancelled,
      (status) => {
        job.paused = status.paused;
        job.resumeAt = status.resumeAt;
      }
    );
    log(`Done! ${result.completed} downloaded, ${result.errors} failed, ${result.skipped} skipped`);
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    finishJob(job);
  }
}

// --- yt-dlp Scrape ---

/**
 * Run a yt-dlp download described by `job.config`, which is passed to
 * `runYtdlp` unchanged. Progress is mirrored onto the job; the total grows
 * to match the completed count because it is not known up front.
 */
async function runYtdlpScrape(job) {
  const config = job.config;
  const log = (msg) => addLog(job, msg);

  log(`Starting yt-dlp download: ${config.url}`);
  log(`Quality: ${config.quality || 'best'}, Playlist: ${config.playlist ? 'yes' : 'no'}`);

  try {
    const result = await runYtdlp(
      config,
      log,
      (completed, errors) => {
        job.progress.completed = completed;
        // NOTE(review): errors are accumulated here, unlike the other runners —
        // presumably runYtdlp reports a per-call delta; confirm against the scraper.
        job.progress.errors += errors;
        if (completed > job.progress.total) job.progress.total = completed;
      },
      () => job.cancelled
    );

    if (result.cancelled) {
      log('Cancelled by user');
    } else {
      log(`Done! ${result.files} file${result.files !== 1 ? 's' : ''} downloaded`);
    }
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    finishJob(job);
  }
}

// --- LeakGallery Scrape ---

/**
 * Collect media items for a leakgallery URL, then download them into
 * MEDIA_PATH/<folderName>. Reads url, pages, workers, delay (ms) and
 * folderName from `job.config`.
 */
async function runLeakGalleryScrape(job) {
  const { url, pages, workers, delay, folderName } = job.config;
  const log = (msg) => addLog(job, msg);
  const isCancelled = () => job.cancelled;

  try {
    const outputDir = prepareOutputDir(folderName);

    log(`Starting leakgallery scrape: ${url}`);
    log(`Pages: ${pages}, Workers: ${workers}, Delay: ${delay}ms`);

    const { username } = parseLeakGalleryUrl(url);
    log(`Username: ${username}`);

    // Phase 1: Collect all media
    log(`Fetching up to ${pages} pages from API...`);
    const items = await fetchLeakGalleryMedia(username, pages, delay, log, isCancelled);

    if (job.cancelled) {
      log('Cancelled by user');
      return;
    }
    if (items.length === 0) {
      log('No media found');
      return;
    }

    job.progress.total = items.length;
    log(`Found ${items.length} media items. Downloading...`);

    // Phase 2: Download all media files
    const result = await downloadLeakGalleryMedia(
      items,
      outputDir,
      workers,
      log,
      trackProgress(job),
      isCancelled
    );
    log(`Done! ${result.completed} downloaded, ${result.errors} failed, ${result.skipped} skipped`);
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    finishJob(job);
  }
}

// --- Endpoints ---

// Start a forum scrape. Responds immediately with the job id; work continues in the background.
router.post('/api/scrape/forum', (req, res) => {
  const { url, folderName, startPage, endPage, delay, cookies, siteId, lastPageOnly } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  const folderError = folderNameError(folderName);
  if (folderError) return res.status(400).json({ error: folderError });

  const config = {
    // URLs without a page marker get '/page-1' appended so page URLs can be derived.
    url: url.includes('page-') ? url : `${url.replace(/\/$/, '')}/page-1`,
    folderName,
    startPage: intOr(startPage, 1),
    endPage: intOr(endPage, 10),
    delay: parseFloat(delay) || 1.0,
    cookies: cookies || '',
    siteId: siteId ? Number.parseInt(siteId, 10) : null,
    lastPageOnly: !!lastPageOnly,
  };

  const job = createJob('forum', config);
  launchJob(job, runForumScrape);
  res.json({ jobId: job.id, message: 'Forum scrape started' });
});

// Start a coomer scrape (workers clamped to 1-20).
router.post('/api/scrape/coomer', (req, res) => {
  const { url, folderName, pages, workers } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  const folderError = folderNameError(folderName);
  if (folderError) return res.status(400).json({ error: folderError });

  const config = {
    url,
    folderName,
    pages: intOr(pages, 10),
    workers: clampInt(workers, 10, 1, 20),
  };

  const job = createJob('coomer', config);
  launchJob(job, runCoomerScrape);
  res.json({ jobId: job.id, message: 'Coomer scrape started' });
});

// Start a medialink scrape (workers clamped to 1-10).
router.post('/api/scrape/medialink', (req, res) => {
  const { url, folderName, pages, workers, delay } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  const folderError = folderNameError(folderName);
  if (folderError) return res.status(400).json({ error: folderError });

  const config = {
    url,
    folderName,
    pages: intOr(pages, 50),
    workers: clampInt(workers, 3, 1, 10),
    delay: intOr(delay, 500),
  };

  const job = createJob('medialink', config);
  launchJob(job, runMediaLinkScrape);
  res.json({ jobId: job.id, message: 'MediaLink scrape started' });
});

// Start a mega.nz scrape; the URL is validated up front so bad links get a 400.
router.post('/api/scrape/mega', (req, res) => {
  const { url, folderName, workers } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  const folderError = folderNameError(folderName);
  if (folderError) return res.status(400).json({ error: folderError });

  try {
    parseMegaUrl(url);
  } catch (err) {
    return res.status(400).json({ error: err.message });
  }

  const config = {
    url,
    folderName,
    workers: clampInt(workers, 3, 1, 10),
  };

  const job = createJob('mega', config);
  launchJob(job, runMegaScrape);
  res.json({ jobId: job.id, message: 'Mega scrape started' });
});

// Start a yt-dlp download. The job's display folder name is derived from the URL.
router.post('/api/scrape/ytdlp', (req, res) => {
  const {
    url,
    quality,
    customFormat,
    embedMetadata,
    embedThumbnail,
    embedSubs,
    writeSubs,
    subLangs,
    restrictFilenames,
    outputTemplate,
    playlist,
    maxDownloads,
    concurrentFragments,
    rateLimit,
    sponsorBlock,
    cookiesFile,
  } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });

  // "<hostname>/<path>" truncated to 60 chars, or the raw URL prefix if it does not parse.
  let derivedFolderName;
  try {
    const u = new URL(url);
    const path = u.pathname.replace(/^\//, '').replace(/\/$/, '');
    derivedFolderName = path ? `${u.hostname}/${path}`.slice(0, 60) : u.hostname;
  } catch {
    derivedFolderName = url.slice(0, 60);
  }

  const config = {
    url,
    quality: quality || 'best',
    customFormat: customFormat || '',
    // These flags default to on: only an explicit `false` disables them.
    embedMetadata: embedMetadata !== false,
    embedThumbnail: embedThumbnail !== false,
    embedSubs: embedSubs !== false,
    writeSubs: writeSubs || false,
    subLangs: subLangs || 'en',
    restrictFilenames: restrictFilenames !== false,
    outputTemplate: outputTemplate || '%(title)s.%(ext)s',
    playlist: playlist || false,
    maxDownloads: intOr(maxDownloads, 0),
    concurrentFragments: clampInt(concurrentFragments, 4, 1, 16),
    rateLimit: rateLimit || '',
    sponsorBlock: sponsorBlock || 'off',
    cookiesFile: cookiesFile || '',
    folderName: derivedFolderName,
  };

  const job = createJob('ytdlp', config);
  launchJob(job, runYtdlpScrape);
  res.json({ jobId: job.id, message: 'yt-dlp download started' });
});

// Start a leakgallery scrape; the URL is validated up front so bad links get a 400.
router.post('/api/scrape/leakgallery', (req, res) => {
  const { url, folderName, pages, workers, delay } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  const folderError = folderNameError(folderName);
  if (folderError) return res.status(400).json({ error: folderError });

  try {
    parseLeakGalleryUrl(url);
  } catch (err) {
    return res.status(400).json({ error: err.message });
  }

  const config = {
    url,
    folderName,
    pages: intOr(pages, 100),
    workers: clampInt(workers, 3, 1, 10),
    delay: intOr(delay, 300),
  };

  const job = createJob('leakgallery', config);
  launchJob(job, runLeakGalleryScrape);
  res.json({ jobId: job.id, message: 'LeakGallery scrape started' });
});

// List all known jobs, newest first (without log lines).
router.get('/api/scrape/jobs', (_req, res) => {
  const jobs = [...jobsMap.values()].map(jobToJson);
  jobs.sort((a, b) => new Date(b.startedAt) - new Date(a.startedAt));
  res.json(jobs);
});

// Fetch one job including its log lines.
router.get('/api/scrape/jobs/:jobId', (req, res) => {
  const job = jobsMap.get(req.params.jobId);
  if (!job) return res.status(404).json({ error: 'Job not found' });
  res.json({ ...jobToJson(job), logs: job.logs });
});

// Request cancellation; runners observe `job.cancelled` cooperatively.
router.post('/api/scrape/jobs/:jobId/cancel', (req, res) => {
  const job = jobsMap.get(req.params.jobId);
  if (!job) return res.status(404).json({ error: 'Job not found' });
  if (!job.running) return res.status(400).json({ error: 'Job is not running' });
  job.cancelled = true;
  addLog(job, 'Cancel requested');
  res.json({ message: 'Cancel requested' });
});

// Remove a job from the registry, flagging it cancelled in case it is still running.
router.delete('/api/scrape/jobs/:jobId', (req, res) => {
  const job = jobsMap.get(req.params.jobId);
  if (!job) return res.status(404).json({ error: 'Job not found' });
  job.cancelled = true;
  job.running = false;
  jobsMap.delete(req.params.jobId);
  res.json({ message: 'Job removed' });
});

// Auto-detect max page for forum URLs
router.post('/api/scrape/forum/detect-pages', async (req, res) => {
  const { url, cookies } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });

  const logs = [];
  try {
    const maxPage = await detectMaxPage(url, (msg) => logs.push(msg), cookies);
    res.json({ maxPage, logs });
  } catch (err) {
    // Without this, a rejected detection would leave the request without a response.
    res.status(500).json({ error: err.message, logs });
  }
});

// --- Forum Sites CRUD ---

router.get('/api/scrape/forum-sites', (_req, res) => {
  res.json(getForumSites());
});

router.post('/api/scrape/forum-sites', (req, res) => {
  const { name, baseUrl, cookies, username, password } = req.body;
  if (!name) return res.status(400).json({ error: 'Name is required' });
  const id = createForumSite(name, baseUrl, cookies, username, password);
  res.json(getForumSiteById(id));
});

// Partial update: only the fields present in the request body are changed.
router.put('/api/scrape/forum-sites/:id', (req, res) => {
  const id = Number.parseInt(req.params.id, 10);
  const site = getForumSiteById(id);
  if (!site) return res.status(404).json({ error: 'Forum site not found' });

  const { name, baseUrl, cookies, username, password } = req.body;
  const fields = {};
  if (name !== undefined) fields.name = name;
  if (baseUrl !== undefined) fields.base_url = baseUrl;
  if (cookies !== undefined) fields.cookies = cookies;
  if (username !== undefined) fields.username = username;
  if (password !== undefined) fields.password = password;

  updateForumSite(id, fields);
  res.json(getForumSiteById(id));
});

router.delete('/api/scrape/forum-sites/:id', (req, res) => {
  const id = Number.parseInt(req.params.id, 10);
  deleteForumSite(id);
  res.json({ ok: true });
});

// --- Auto-scrape CRUD ---

router.get('/api/scrape/auto', (_req, res) => {
  res.json(getAutoScrapeJobs());
});

router.post('/api/scrape/auto', (req, res) => {
  const { type, url, folderName, config } = req.body;
  if (!type || !url || !folderName || !config) {
    return res.status(400).json({ error: 'type, url, folderName, and config are required' });
  }
  addAutoScrapeJob(type, url, folderName, config);
  res.json({ ok: true });
});

router.delete('/api/scrape/auto/:id', (req, res) => {
  removeAutoScrapeJob(Number.parseInt(req.params.id, 10));
  res.json({ ok: true });
});

/** @returns {number} How many registered jobs are currently running. */
export function getActiveScrapeCount() {
  let count = 0;
  for (const job of jobsMap.values()) {
    if (job.running) count++;
  }
  return count;
}

/** @returns {Array<{type: string, folderName: string, progress: object}>} Summary of running jobs. */
export function getActiveScrapesList() {
  const list = [];
  for (const job of jobsMap.values()) {
    if (job.running) {
      list.push({ type: job.type, folderName: job.folderName, progress: job.progress });
    }
  }
  return list;
}

export { runForumScrape, runCoomerScrape, runMediaLinkScrape, runMegaScrape, runYtdlpScrape, runLeakGalleryScrape, createJob };
export default router;