import { Router } from 'express';
import { mkdirSync } from 'fs';
import { join } from 'path';
import { scrapeForumPage, getPageUrl, detectMaxPage } from './scrapers/forum.js';
import { parseUserUrl, fetchAllPosts, downloadFiles } from './scrapers/coomer.js';
import { parseMediaUrl, fetchAllMedia, downloadMedia } from './scrapers/medialink.js';

const router = Router();
const MEDIA_PATH = process.env.MEDIA_PATH || './data/media';

// In-memory job registry. Finished jobs are pruned beyond MAX_COMPLETED,
// and each job keeps only its MAX_LOGS most recent log lines.
const jobsMap = new Map();
let jobCounter = 0;
const MAX_COMPLETED = 50;
const MAX_LOGS = 200;

function createJob(type, config) {
  const id = `scrape_${Date.now()}_${++jobCounter}`;
  const job = {
    id,
    type,
    config,
    progress: { total: 0, completed: 0, errors: 0 },
    running: true,
    cancelled: false,
    logs: [],
    startedAt: new Date().toISOString(),
    completedAt: null,
    folderName: config.folderName || 'scrape',
  };
  jobsMap.set(id, job);
  return job;
}

function addLog(job, msg) {
  const ts = new Date().toLocaleTimeString('en-US', { hour12: false });
  job.logs.push(`[${ts}] ${msg}`);
  if (job.logs.length > MAX_LOGS) job.logs.shift();
}

function pruneCompleted() {
  const completed = [...jobsMap.values()]
    .filter(j => !j.running)
    .sort((a, b) => new Date(b.completedAt) - new Date(a.completedAt));
  if (completed.length > MAX_COMPLETED) {
    for (const old of completed.slice(MAX_COMPLETED)) {
      jobsMap.delete(old.id);
    }
  }
}

// Serialize a job for list views; logs are omitted (only a count is sent)
// and are available from the per-job detail endpoint.
function jobToJson(job) {
  return {
    id: job.id,
    type: job.type,
    config: job.config,
    progress: job.progress,
    running: job.running,
    cancelled: job.cancelled,
    folderName: job.folderName,
    startedAt: job.startedAt,
    completedAt: job.completedAt,
    logCount: job.logs.length,
  };
}
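// Illustrative only: what jobToJson yields for an in-flight forum job.
// All field values below are hypothetical examples, not fixtures.
//
// {
//   "id": "scrape_1700000000000_1",
//   "type": "forum",
//   "config": { "url": "https://example.com/threads/t/page-1", "folderName": "t",
//               "startPage": 1, "endPage": 10, "delay": 1 },
//   "progress": { "total": 10, "completed": 3, "errors": 0 },
//   "running": true,
//   "cancelled": false,
//   "folderName": "t",
//   "startedAt": "2024-01-01T00:00:00.000Z",
//   "completedAt": null,
//   "logCount": 12
// }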
// --- Forum Scrape ---
async function runForumScrape(job) {
  const { url, startPage, endPage, delay, folderName } = job.config;
  const outputDir = join(MEDIA_PATH, folderName);
  mkdirSync(outputDir, { recursive: true });

  // Shared across pages so an image seen on several pages is fetched once.
  const downloadedSet = new Set();
  let totalImages = 0;

  addLog(job, `Starting forum scrape: pages ${startPage}-${endPage}`);
  addLog(job, `Output: ${outputDir}`);
  job.progress.total = endPage - startPage + 1;

  try {
    for (let page = startPage; page <= endPage; page++) {
      if (job.cancelled) {
        addLog(job, 'Cancelled by user');
        break;
      }
      const pageUrl = getPageUrl(url, page);
      addLog(job, `--- Page ${page}/${endPage} ---`);
      const count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg));
      totalImages += count;
      job.progress.completed = page - startPage + 1;
      // Polite delay between pages (delay is in seconds).
      if (page < endPage && !job.cancelled) {
        await new Promise(r => setTimeout(r, delay * 1000));
      }
    }
  } catch (err) {
    addLog(job, `Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    job.running = false;
    job.completedAt = new Date().toISOString();
    addLog(job, `Done! ${totalImages} images saved to ${folderName}/`);
    pruneCompleted();
  }
}

// --- Coomer Scrape ---
async function runCoomerScrape(job) {
  const { url, pages, workers, folderName } = job.config;
  const outputDir = join(MEDIA_PATH, folderName);
  mkdirSync(outputDir, { recursive: true });

  addLog(job, `Starting coomer scrape: ${url}`);
  addLog(job, `Pages: ${pages}, Workers: ${workers}`);

  try {
    const { base, service, userId } = parseUserUrl(url);
    addLog(job, `Site: ${base}, Service: ${service}, User: ${userId}`);

    // Phase 1: Collect files
    addLog(job, `Fetching up to ${pages} pages...`);
    const files = await fetchAllPosts(base, service, userId, pages,
      (msg) => addLog(job, msg),
      () => job.cancelled
    );
    if (job.cancelled) {
      addLog(job, 'Cancelled by user');
      return;
    }
    if (files.length === 0) {
      addLog(job, 'No files found');
      return;
    }

    job.progress.total = files.length;
    addLog(job, `Found ${files.length} files. Starting downloads...`);

    // Phase 2: Download
    const result = await downloadFiles(files, outputDir, workers,
      (msg) => addLog(job, msg),
      (completed, errors, total) => {
        job.progress.completed = completed;
        job.progress.errors = errors;
        job.progress.total = total;
      },
      () => job.cancelled
    );
    addLog(job, `Done! ${result.completed} downloaded, ${result.errors} failed, ${result.skipped} skipped`);
  } catch (err) {
    addLog(job, `Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted();
  }
}

// --- MediaLink Scrape ---
async function runMediaLinkScrape(job) {
  const { url, pages, workers, delay, folderName } = job.config;
  const outputDir = join(MEDIA_PATH, folderName);
  mkdirSync(outputDir, { recursive: true });

  addLog(job, `Starting medialink scrape: ${url}`);
  addLog(job, `Pages: ${pages}, Workers: ${workers}, Delay: ${delay}ms`);

  try {
    const { base, userId } = parseMediaUrl(url);
    addLog(job, `Site: ${base}, User ID: ${userId}`);

    // Phase 1: Collect all media via JSON API
    addLog(job, `Fetching up to ${pages} pages from API...`);
    const items = await fetchAllMedia(base, userId, pages, delay,
      (msg) => addLog(job, msg),
      () => job.cancelled
    );
    if (job.cancelled) {
      addLog(job, 'Cancelled by user');
      return;
    }
    if (items.length === 0) {
      addLog(job, 'No media found');
      return;
    }

    job.progress.total = items.length;
    addLog(job, `Found ${items.length} media items. Downloading...`);

    // Phase 2: Download all media files
    const result = await downloadMedia(items, outputDir, workers,
      (msg) => addLog(job, msg),
      (completed, errors, total) => {
        job.progress.completed = completed;
        job.progress.errors = errors;
        job.progress.total = total;
      },
      () => job.cancelled
    );
    addLog(job, `Done! ${result.completed} downloaded, ${result.errors} failed, ${result.skipped} skipped`);
  } catch (err) {
    addLog(job, `Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted();
  }
}
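// Each runner above passes `() => job.cancelled` as the final argument to the
// scraper helpers, so cancellation is cooperative: the helper is expected to
// poll the callback between units of work. A minimal sketch of that contract,
// assuming a page-at-a-time collector (hypothetical, not the actual code in
// ./scrapers/coomer.js; `fetchPage` and the post shape are made up here):
//
// async function fetchAllPosts(base, service, userId, maxPages, log, shouldCancel) {
//   const files = [];
//   for (let page = 0; page < maxPages; page++) {
//     if (shouldCancel()) break;            // checked between pages, not mid-request
//     const posts = await fetchPage(base, service, userId, page);
//     if (posts.length === 0) break;        // ran out of content early
//     for (const post of posts) files.push(...post.files);
//     log(`Page ${page + 1}: ${posts.length} posts`);
//   }
//   return files;
// }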
// --- Endpoints ---
router.post('/api/scrape/forum', (req, res) => {
  const { url, folderName, startPage, endPage, delay } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  if (!folderName) return res.status(400).json({ error: 'Folder name is required' });

  const config = {
    // Normalize to a paged URL so getPageUrl can substitute page numbers.
    url: url.includes('page-') ? url : `${url.replace(/\/$/, '')}/page-1`,
    folderName,
    startPage: parseInt(startPage, 10) || 1,
    endPage: parseInt(endPage, 10) || 10,
    delay: parseFloat(delay) || 1.0,
  };

  const job = createJob('forum', config);
  runForumScrape(job).catch(err => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
  });
  res.json({ jobId: job.id, message: 'Forum scrape started' });
});

router.post('/api/scrape/coomer', (req, res) => {
  const { url, folderName, pages, workers } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  if (!folderName) return res.status(400).json({ error: 'Folder name is required' });

  const config = {
    url,
    folderName,
    pages: parseInt(pages, 10) || 10,
    workers: Math.min(Math.max(parseInt(workers, 10) || 10, 1), 20), // clamp to 1-20
  };

  const job = createJob('coomer', config);
  runCoomerScrape(job).catch(err => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
  });
  res.json({ jobId: job.id, message: 'Coomer scrape started' });
});

router.post('/api/scrape/medialink', (req, res) => {
  const { url, folderName, pages, workers, delay } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  if (!folderName) return res.status(400).json({ error: 'Folder name is required' });

  const config = {
    url,
    folderName,
    pages: parseInt(pages, 10) || 50,
    workers: Math.min(Math.max(parseInt(workers, 10) || 3, 1), 10), // clamp to 1-10
    delay: parseInt(delay, 10) || 500,
  };

  const job = createJob('medialink', config);
  runMediaLinkScrape(job).catch(err => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
  });
  res.json({ jobId: job.id, message: 'MediaLink scrape started' });
});

router.get('/api/scrape/jobs', (_req, res) => {
  const jobs = [...jobsMap.values()].map(jobToJson);
  jobs.sort((a, b) => new Date(b.startedAt) - new Date(a.startedAt));
  res.json(jobs);
});

router.get('/api/scrape/jobs/:jobId', (req, res) => {
  const job = jobsMap.get(req.params.jobId);
  if (!job) return res.status(404).json({ error: 'Job not found' });
  res.json({ ...jobToJson(job), logs: job.logs });
});

router.post('/api/scrape/jobs/:jobId/cancel', (req, res) => {
  const job = jobsMap.get(req.params.jobId);
  if (!job) return res.status(404).json({ error: 'Job not found' });
  if (!job.running) return res.status(400).json({ error: 'Job is not running' });
  job.cancelled = true;
  addLog(job, 'Cancel requested');
  res.json({ message: 'Cancel requested' });
});

// Auto-detect max page for forum URLs
router.post('/api/scrape/forum/detect-pages', async (req, res) => {
  const { url } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  const logs = [];
  try {
    const maxPage = await detectMaxPage(url, (msg) => logs.push(msg));
    res.json({ maxPage, logs });
  } catch (err) {
    // Without this, a rejected promise would leave the request hanging
    // under Express 4's default error handling.
    res.status(500).json({ error: err.message, logs });
  }
});

export default router;
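// Illustrative client usage (host and port are assumptions; adjust to your
// deployment):
//
//   # Start a forum scrape
//   curl -X POST http://localhost:3000/api/scrape/forum \
//     -H 'Content-Type: application/json' \
//     -d '{"url":"https://example.com/threads/some-thread/","folderName":"some-thread","startPage":1,"endPage":5,"delay":1}'
//
//   # Poll progress and logs for the returned jobId
//   curl http://localhost:3000/api/scrape/jobs/<jobId>
//
//   # Request cooperative cancellation
//   curl -X POST http://localhost:3000/api/scrape/jobs/<jobId>/cancel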