aa4f1157d1
DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so direct Node fetch returns 403 even with valid cookies. Page HTML for any forum_site with stored cookies is now fetched via a FlareSolverr browser session opened once per scrape job. - Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those cookies seed undetected_chromedriver, Turnstile auto-solves in the real browser, login form submits, final cookies + browser UA persist to forum_sites - Per-site user_agent column so subsequent scraper requests match the UA the cookies were issued for (DDoS-Guard rejects UA mismatches) - XenForo search rewritten as proper CSRF POST /search/search → results page parse, replacing the broken ?q=... GET that only returned the search form - Pagination regex fallback in detectMaxPage catches XenForo pages that cheerio's class-based selectors miss - New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering the page via FlareSolverr and grabbing the signed mp4 from the resolved <video src> attribute (gallery-dl can't extract these — obfuscated WASM) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
868 lines
29 KiB
JavaScript
868 lines
29 KiB
JavaScript
import { Router } from 'express';
|
|
import { mkdirSync } from 'fs';
|
|
import { join } from 'path';
|
|
import * as cheerio from 'cheerio';
|
|
import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
|
|
import { refreshForumCookies, fsCreateSession, fsDestroySession, fsGet, fsPost } from './flaresolverr.js';
|
|
import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js';
|
|
import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js';
|
|
import { parseMegaUrl, listAllFiles, downloadMegaFiles } from './scrapers/mega.js';
|
|
import { runYtdlp } from './scrapers/ytdlp.js';
|
|
import { parseLeakGalleryUrl, fetchAllMedia as fetchLeakGalleryMedia, downloadMedia as downloadLeakGalleryMedia } from './scrapers/leakgallery.js';
|
|
import { getAutoScrapeJobs, addAutoScrapeJob, removeAutoScrapeJob, getForumSites, getForumSiteById, createForumSite, updateForumSite, deleteForumSite } from './db.js';
|
|
|
|
/** Express router that every scrape endpoint in this module is registered on. */
const router = Router();

/** Root directory for downloaded media; overridable via the MEDIA_PATH env var. */
const MEDIA_PATH = process.env.MEDIA_PATH || './data/media';

/** In-memory registry of scrape jobs, keyed by job id. */
const jobsMap = new Map();

/** Counter appended to job ids so two jobs created in the same millisecond differ. */
let jobCounter = 0;

/** Maximum number of finished jobs kept by pruneCompleted(). */
const MAX_COMPLETED = 50;

/** Maximum number of log lines kept per job by addLog(). */
const MAX_LOGS = 200;
|
|
|
|
/**
 * Build a new job record, store it in `jobsMap`, and hand it back.
 *
 * The id has the form `scrape_<epoch-ms>_<counter>`; the counter is the
 * module-level `jobCounter`, incremented on every call.
 *
 * @param {string} type - Job kind label (stored as-is on the record).
 * @param {object} config - Job configuration; `config.folderName` is copied
 *   onto the record, defaulting to `'scrape'` when falsy.
 * @returns {object} The freshly registered job record (initially running).
 */
function createJob(type, config) {
  jobCounter += 1;
  const id = ['scrape', Date.now(), jobCounter].join('_');

  const job = {
    id,
    type,
    config,
    progress: { total: 0, completed: 0, errors: 0 },
    running: true,
    cancelled: false,
    logs: [],
    startedAt: new Date().toISOString(),
    completedAt: null,
    folderName: config.folderName || 'scrape',
  };

  jobsMap.set(job.id, job);
  return job;
}
|
|
|
|
/**
 * Append a timestamped line to a job's log, keeping at most MAX_LOGS lines.
 *
 * The timestamp is the local wall-clock time formatted as `HH:MM:SS`.
 * `hourCycle: 'h23'` is used instead of `hour12: false` because the latter can
 * render midnight as `24:xx:xx` for the `en-US` locale on some V8/ICU builds.
 *
 * @param {{ logs: string[] }} job - Job record whose `logs` array is mutated.
 * @param {string} msg - Message text to record.
 * @returns {void}
 */
function addLog(job, msg) {
  const ts = new Date().toLocaleTimeString('en-US', { hourCycle: 'h23' });
  job.logs.push(`[${ts}] ${msg}`);
  // Drop the oldest line once the cap is exceeded (one push => at most one shift).
  if (job.logs.length > MAX_LOGS) job.logs.shift();
}
|
|
|
|
/**
 * Evict the oldest finished jobs from `jobsMap` so that at most
 * MAX_COMPLETED non-running jobs remain. Running jobs are never removed.
 *
 * "Oldest" is decided by `completedAt`, newest first.
 *
 * @returns {void}
 */
function pruneCompleted() {
  const finished = [];
  for (const job of jobsMap.values()) {
    if (!job.running) finished.push(job);
  }

  if (finished.length <= MAX_COMPLETED) return;

  // Newest first, so everything past index MAX_COMPLETED is surplus.
  finished.sort((x, y) => new Date(y.completedAt) - new Date(x.completedAt));
  finished.slice(MAX_COMPLETED).forEach((stale) => {
    jobsMap.delete(stale.id);
  });
}
|
|
|
|
/**
 * Project a job record onto the shape returned by the job endpoints.
 *
 * The log lines themselves are left out; only their count is reported.
 * `paused` and `resumeAt` are optional on the record and default to
 * `false` / `null` respectively.
 *
 * @param {object} job - Job record as created by createJob().
 * @returns {object} Serializable summary of the job.
 */
function jobToJson(job) {
  const {
    id,
    type,
    config,
    progress,
    running,
    cancelled,
    folderName,
    startedAt,
    completedAt,
  } = job;

  return {
    id,
    type,
    config,
    progress,
    running,
    cancelled,
    paused: job.paused || false,
    resumeAt: job.resumeAt || null,
    folderName,
    startedAt,
    completedAt,
    logCount: job.logs.length,
  };
}
|
|
|
|
// --- Forum Scrape ---
|
|
|
|
/**
 * Scrape a range of forum thread pages for `job`, page by page.
 *
 * Reads `url`, `startPage`, `endPage`, `delay` (seconds between pages),
 * `folderName`, `siteId`, `lastPageOnly`, `cookies` and `userAgent` from
 * `job.config`. When no cookies are supplied but a `siteId` is, cookies and
 * the pinned user agent are loaded from the stored forum site and written
 * back to `job.config`.
 *
 * When both a `siteId` and cookies are present, a FlareSolverr session is
 * opened and passed to the page helpers; if opening it fails the scrape
 * continues without one. On a CookieExpiredError the cookies are refreshed
 * once via `refreshForumCookies` (only when a `siteId` is set) and the page is
 * retried; any failure during that refresh-and-retry stops the scrape.
 *
 * Everything that happens after the session is opened runs inside one
 * try/finally so the session is always destroyed and the job is always marked
 * finished, including when last-page detection throws.
 *
 * @param {object} job - Job record created by createJob(); mutated in place
 *   (`progress`, `running`, `completedAt`, `config.cookies`, `config.userAgent`).
 * @returns {Promise<void>} Resolves when the scrape finished, was cancelled,
 *   or stopped on an error (errors after setup are logged, not rethrown).
 */
async function runForumScrape(job) {
  const { url, delay, folderName, siteId, lastPageOnly } = job.config;
  let { startPage, endPage, cookies } = job.config;
  let userAgent = job.config.userAgent || '';
  const log = (msg) => addLog(job, msg);

  // Load cookies from the forum site record if siteId provided and no cookies passed.
  if (!cookies && siteId) {
    const site = getForumSiteById(siteId);
    if (site && site.cookies) {
      cookies = site.cookies;
      userAgent = site.user_agent || userAgent;
      job.config.cookies = cookies;
      job.config.userAgent = userAgent;
      log(`Loaded cookies from forum site: ${site.name}${userAgent ? ` (UA pinned)` : ''}`);
    }
  }

  const outputDir = join(MEDIA_PATH, folderName);
  mkdirSync(outputDir, { recursive: true });

  const downloadedSet = new Set();
  let totalImages = 0;

  // When a siteId is in play, page HTML must be fetched through FlareSolverr —
  // direct fetch hits DDoS-Guard's browser-fingerprint check and gets 403.
  let fsSession = null;
  if (siteId && cookies) {
    try {
      const baseHost = new URL(url).origin;
      log(`Opening FlareSolverr session for ${baseHost}...`);
      fsSession = await fsCreateSession(baseHost + '/');
      log(`FlareSolverr session ready (${fsSession.slice(0, 8)}...)`);
    } catch (e) {
      log(`FlareSolverr session failed (${e.message}) — falling back to direct fetch`);
    }
  }

  try {
    // If lastPageOnly, detect the last page and only scrape that. This lives
    // inside the try so a detection failure still releases the session below.
    if (lastPageOnly) {
      log('Detecting last page...');
      const maxPage = await detectMaxPage(url, log, cookies, userAgent, fsSession);
      if (maxPage) {
        startPage = maxPage;
        endPage = maxPage;
        log(`Last page detected: ${maxPage}`);
      } else {
        log('Could not detect last page — falling back to page range');
      }
    }

    log(`Starting forum scrape: pages ${startPage}-${endPage}`);
    log(`Output: ${outputDir}`);

    // An inverted range (start > end) scrapes nothing; never report a negative total.
    job.progress.total = Math.max(0, endPage - startPage + 1);

    for (let page = startPage; page <= endPage; page++) {
      if (job.cancelled) {
        log('Cancelled by user');
        break;
      }

      const pageUrl = getPageUrl(url, page);
      log(`--- Page ${page}/${endPage} ---`);

      let count;
      try {
        count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, log, cookies, userAgent, fsSession);
      } catch (err) {
        if (err instanceof CookieExpiredError && siteId) {
          log(`Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`);
          try {
            cookies = await refreshForumCookies(siteId);
            const refreshed = getForumSiteById(siteId);
            userAgent = refreshed?.user_agent || userAgent;
            job.config.cookies = cookies;
            job.config.userAgent = userAgent;
            log('Cookies refreshed successfully — retrying page...');
            count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, log, cookies, userAgent, fsSession);
          } catch (refreshErr) {
            // Covers both a failed refresh and a failed retry of the page.
            log(`Cookie refresh failed: ${refreshErr.message}`);
            log('Stopping scrape — fix credentials or refresh cookies manually');
            break;
          }
        } else if (err instanceof CookieExpiredError) {
          log(`Cookie expired (HTTP ${err.statusCode}) — no siteId configured for auto-refresh`);
          log('Stopping scrape — refresh cookies manually and try again');
          break;
        } else {
          throw err;
        }
      }

      totalImages += count;
      job.progress.completed = page - startPage + 1;

      if (page < endPage && !job.cancelled) {
        await new Promise((r) => setTimeout(r, delay * 1000));
      }
    }
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    if (fsSession) {
      // A failing cleanup must not prevent the job from being marked finished.
      try {
        await fsDestroySession(fsSession);
        log(`FlareSolverr session closed`);
      } catch (closeErr) {
        log(`FlareSolverr session close failed: ${closeErr.message}`);
      }
    }
    job.running = false;
    job.completedAt = new Date().toISOString();
    log(`Done! ${totalImages} files saved to ${folderName}/`);
    pruneCompleted();
  }
}
|
|
|
|
// --- Coomer Scrape ---
|
|
|
|
/**
 * Run a coomer scrape for `job`: collect file entries, then download them.
 *
 * `job.config` supplies `url`, `pages`, `workers` and `folderName`. The URL is
 * parsed with `parseUserUrl`; a result with `mode === 'search'` is fetched via
 * `fetchSearchPosts`, anything else via `fetchAllPosts`. Errors are logged and
 * counted on `job.progress.errors`; the job is always marked finished.
 *
 * @param {object} job - Job record created by createJob(); mutated in place.
 * @returns {Promise<void>}
 */
async function runCoomerScrape(job) {
  const { url, pages, workers, folderName } = job.config;
  const log = (line) => addLog(job, line);
  const isCancelled = () => job.cancelled;

  const outputDir = join(MEDIA_PATH, folderName);
  mkdirSync(outputDir, { recursive: true });

  log(`Starting coomer scrape: ${url}`);
  log(`Pages: ${pages}, Workers: ${workers}`);

  try {
    const target = parseUserUrl(url);
    const searching = target.mode === 'search';

    // Phase 1: collect the file list.
    if (searching) {
      log(`Site: ${target.base}, Search: "${target.query}"`);
    } else {
      log(`Site: ${target.base}, Service: ${target.service}, User: ${target.userId}`);
    }
    log(`Fetching up to ${pages} pages...`);

    const files = searching
      ? await fetchSearchPosts(target.base, target.query, pages, log, isCancelled)
      : await fetchAllPosts(target.base, target.service, target.userId, pages, log, isCancelled);

    if (job.cancelled) {
      log('Cancelled by user');
      return;
    }
    if (files.length === 0) {
      log('No files found');
      return;
    }

    job.progress.total = files.length;
    log(`Found ${files.length} files. Starting downloads...`);

    // Phase 2: Download
    const trackProgress = (completed, errors, total) => {
      job.progress.completed = completed;
      job.progress.errors = errors;
      job.progress.total = total;
    };
    const summary = await downloadFiles(files, outputDir, workers, log, trackProgress, isCancelled);

    log(`Done! ${summary.completed} downloaded, ${summary.errors} failed, ${summary.skipped} skipped`);
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted();
  }
}
|
|
|
|
// --- MediaLink Scrape ---
|
|
|
|
/**
 * Run a medialink scrape for `job`: collect media items, then download them.
 *
 * `job.config` supplies `url`, `pages`, `workers`, `delay` and `folderName`.
 * `parseMediaUrl` decides the mode: `'html'` collects via
 * `fetchAllMediaFromHtml`, anything else via `fetchAllMedia`. The site base
 * with a trailing slash is passed to `downloadMedia` as its last argument.
 * Errors are logged and counted; the job is always marked finished.
 *
 * @param {object} job - Job record created by createJob(); mutated in place.
 * @returns {Promise<void>}
 */
async function runMediaLinkScrape(job) {
  const { url, pages, workers, delay, folderName } = job.config;
  const log = (line) => addLog(job, line);
  const isCancelled = () => job.cancelled;

  const outputDir = join(MEDIA_PATH, folderName);
  mkdirSync(outputDir, { recursive: true });

  log(`Starting medialink scrape: ${url}`);
  log(`Pages: ${pages}, Workers: ${workers}, Delay: ${delay}ms`);

  try {
    const { base, userId, mode } = parseMediaUrl(url);
    const viaHtml = mode === 'html';
    log(`Site: ${base}, ${viaHtml ? 'Slug' : 'User ID'}: ${userId} (${mode} mode)`);

    // Phase 1: Collect all media
    log(viaHtml
      ? `Fetching up to ${pages} pages via HTML scraping...`
      : `Fetching up to ${pages} pages from API...`);
    const items = viaHtml
      ? await fetchAllMediaFromHtml(base, userId, pages, delay, log, isCancelled)
      : await fetchAllMedia(base, userId, pages, delay, log, isCancelled);

    if (job.cancelled) {
      log('Cancelled by user');
      return;
    }
    if (items.length === 0) {
      log('No media found');
      return;
    }

    job.progress.total = items.length;
    log(`Found ${items.length} media items. Downloading...`);

    // Phase 2: Download all media files
    const trackProgress = (completed, errors, total) => {
      job.progress.completed = completed;
      job.progress.errors = errors;
      job.progress.total = total;
    };
    const summary = await downloadMedia(
      items,
      outputDir,
      workers,
      log,
      trackProgress,
      isCancelled,
      `${base}/`,
    );

    log(`Done! ${summary.completed} downloaded, ${summary.errors} failed, ${summary.skipped} skipped`);
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted();
  }
}
|
|
|
|
// --- Mega Scrape ---
|
|
|
|
/**
 * Run a mega.nz folder scrape for `job`: list the files, then download them.
 *
 * `job.config` supplies `url`, `workers` and `folderName`. `parseMegaUrl` is
 * called only for its validation side effect (it throws on a bad URL). The
 * downloader's status callback is mirrored onto `job.paused` /
 * `job.resumeAt`. Errors are logged and counted; the job is always marked
 * finished.
 *
 * @param {object} job - Job record created by createJob(); mutated in place.
 * @returns {Promise<void>}
 */
async function runMegaScrape(job) {
  const { url, workers, folderName } = job.config;
  const log = (line) => addLog(job, line);

  const outputDir = join(MEDIA_PATH, folderName);
  mkdirSync(outputDir, { recursive: true });

  log(`Starting mega.nz scrape: ${url}`);
  log(`Workers: ${workers}`);

  try {
    parseMegaUrl(url);

    // Phase 1: List all files
    const { items } = await listAllFiles(url, log);

    if (job.cancelled) {
      log('Cancelled by user');
      return;
    }
    if (items.length === 0) {
      log('No files found in folder');
      return;
    }

    job.progress.total = items.length;

    let totalBytes = 0;
    for (const item of items) totalBytes += item.size;
    const totalSizeMb = (totalBytes / (1024 * 1024)).toFixed(0);
    log(`Found ${items.length} files (${totalSizeMb} MB). Downloading...`);

    // Phase 2: Download
    const trackProgress = (completed, errors, total) => {
      job.progress.completed = completed;
      job.progress.errors = errors;
      job.progress.total = total;
    };
    const trackStatus = (status) => {
      job.paused = status.paused;
      job.resumeAt = status.resumeAt;
    };
    const summary = await downloadMegaFiles(
      items,
      outputDir,
      workers,
      log,
      trackProgress,
      () => job.cancelled,
      trackStatus,
    );

    log(`Done! ${summary.completed} downloaded, ${summary.errors} failed, ${summary.skipped} skipped`);
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted();
  }
}
|
|
|
|
// --- yt-dlp Scrape ---
|
|
|
|
/**
 * Run a yt-dlp download for `job` by delegating to `runYtdlp`.
 *
 * The whole `job.config` object is handed to `runYtdlp`. Its progress callback
 * sets `completed`, ADDS the reported error count to `progress.errors`, and
 * raises `progress.total` whenever `completed` exceeds it.
 * NOTE(review): errors are accumulated (`+=`) here while the other scrapers
 * assign; presumably runYtdlp reports per-call deltas — confirm.
 *
 * @param {object} job - Job record created by createJob(); mutated in place.
 * @returns {Promise<void>}
 */
async function runYtdlpScrape(job) {
  const { config } = job;
  const log = (line) => addLog(job, line);

  log(`Starting yt-dlp download: ${config.url}`);
  log(`Quality: ${config.quality || 'best'}, Playlist: ${config.playlist ? 'yes' : 'no'}`);

  const trackProgress = (completed, errors) => {
    const { progress } = job;
    progress.completed = completed;
    progress.errors += errors;
    if (completed > progress.total) progress.total = completed;
  };

  try {
    const outcome = await runYtdlp(config, log, trackProgress, () => job.cancelled);

    if (outcome.cancelled) {
      log('Cancelled by user');
      return;
    }
    const suffix = outcome.files !== 1 ? 's' : '';
    log(`Done! ${outcome.files} file${suffix} downloaded`);
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted();
  }
}
|
|
|
|
// --- LeakGallery Scrape ---
|
|
|
|
/**
 * Run a leakgallery scrape for `job`: collect media items, then download them.
 *
 * `job.config` supplies `url`, `pages`, `workers`, `delay` and `folderName`.
 * The username is taken from `parseLeakGalleryUrl(url)`. Errors are logged and
 * counted on `job.progress.errors`; the job is always marked finished.
 *
 * @param {object} job - Job record created by createJob(); mutated in place.
 * @returns {Promise<void>}
 */
async function runLeakGalleryScrape(job) {
  const { url, pages, workers, delay, folderName } = job.config;
  const log = (line) => addLog(job, line);
  const isCancelled = () => job.cancelled;

  const outputDir = join(MEDIA_PATH, folderName);
  mkdirSync(outputDir, { recursive: true });

  log(`Starting leakgallery scrape: ${url}`);
  log(`Pages: ${pages}, Workers: ${workers}, Delay: ${delay}ms`);

  try {
    const { username } = parseLeakGalleryUrl(url);
    log(`Username: ${username}`);

    // Phase 1: Collect all media
    log(`Fetching up to ${pages} pages from API...`);
    const items = await fetchLeakGalleryMedia(username, pages, delay, log, isCancelled);

    if (job.cancelled) {
      log('Cancelled by user');
      return;
    }
    if (items.length === 0) {
      log('No media found');
      return;
    }

    job.progress.total = items.length;
    log(`Found ${items.length} media items. Downloading...`);

    // Phase 2: Download all media files
    const trackProgress = (completed, errors, total) => {
      job.progress.completed = completed;
      job.progress.errors = errors;
      job.progress.total = total;
    };
    const summary = await downloadLeakGalleryMedia(
      items,
      outputDir,
      workers,
      log,
      trackProgress,
      isCancelled,
    );

    log(`Done! ${summary.completed} downloaded, ${summary.errors} failed, ${summary.skipped} skipped`);
  } catch (err) {
    log(`Error: ${err.message}`);
    job.progress.errors++;
  } finally {
    job.running = false;
    job.completedAt = new Date().toISOString();
    pruneCompleted();
  }
}
|
|
|
|
// --- Endpoints ---
|
|
|
|
/**
 * POST /api/scrape/forum — start a forum scrape job in the background.
 *
 * Body: `url` and `folderName` are required; `startPage` (default 1),
 * `endPage` (default 10), `delay` in seconds (default 1.0), `cookies`,
 * `siteId` and `lastPageOnly` are optional. A URL without a `page-` segment
 * gets `/page-1` appended. Responds immediately with the new job id.
 *
 * NOTE(review): `folderName` is joined onto MEDIA_PATH by the runner without a
 * containment check — confirm callers are trusted or add validation.
 */
router.post('/api/scrape/forum', (req, res) => {
  const { url, folderName, startPage, endPage, delay, cookies, siteId, lastPageOnly } = req.body;
  // A non-string url would otherwise throw on `.includes` below.
  if (!url || typeof url !== 'string') return res.status(400).json({ error: 'URL is required' });
  if (!folderName) return res.status(400).json({ error: 'Folder name is required' });

  const config = {
    url: url.includes('page-') ? url : `${url.replace(/\/$/, '')}/page-1`,
    folderName,
    // Explicit radix so values such as "0x10" are not read as hex.
    startPage: Number.parseInt(startPage, 10) || 1,
    endPage: Number.parseInt(endPage, 10) || 10,
    delay: Number.parseFloat(delay) || 1.0,
    cookies: cookies || '',
    // An unparsable siteId becomes null rather than NaN.
    siteId: siteId ? Number.parseInt(siteId, 10) || null : null,
    lastPageOnly: !!lastPageOnly,
  };

  const job = createJob('forum', config);
  // Safety net for anything the runner throws outside its own try/finally.
  runForumScrape(job).catch((err) => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
  });

  res.json({ jobId: job.id, message: 'Forum scrape started' });
});
|
|
|
|
/**
 * POST /api/scrape/coomer — start a coomer scrape job in the background.
 *
 * Body: `url` and `folderName` are required; `pages` defaults to 10 and
 * `workers` defaults to 10, clamped to the range 1–20. Responds immediately
 * with the new job id.
 */
router.post('/api/scrape/coomer', (req, res) => {
  const { url, folderName, pages, workers } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  if (!folderName) return res.status(400).json({ error: 'Folder name is required' });

  // Explicit radix so values such as "0x10" are not read as hex.
  const requestedWorkers = Number.parseInt(workers, 10) || 10;
  const config = {
    url,
    folderName,
    pages: Number.parseInt(pages, 10) || 10,
    workers: Math.min(Math.max(requestedWorkers, 1), 20),
  };

  const job = createJob('coomer', config);
  // Safety net for anything the runner throws outside its own try/finally.
  runCoomerScrape(job).catch((err) => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
  });

  res.json({ jobId: job.id, message: 'Coomer scrape started' });
});
|
|
|
|
/**
 * POST /api/scrape/medialink — start a medialink scrape job in the background.
 *
 * Body: `url` and `folderName` are required; `pages` defaults to 50,
 * `workers` defaults to 3 (clamped to 1–10) and `delay` defaults to 500 (ms).
 * Responds immediately with the new job id.
 */
router.post('/api/scrape/medialink', (req, res) => {
  const { url, folderName, pages, workers, delay } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  if (!folderName) return res.status(400).json({ error: 'Folder name is required' });

  // Explicit radix so values such as "0x10" are not read as hex.
  const requestedWorkers = Number.parseInt(workers, 10) || 3;
  const config = {
    url,
    folderName,
    pages: Number.parseInt(pages, 10) || 50,
    workers: Math.min(Math.max(requestedWorkers, 1), 10),
    delay: Number.parseInt(delay, 10) || 500,
  };

  const job = createJob('medialink', config);
  // Safety net for anything the runner throws outside its own try/finally.
  runMediaLinkScrape(job).catch((err) => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
  });

  res.json({ jobId: job.id, message: 'MediaLink scrape started' });
});
|
|
|
|
/**
 * POST /api/scrape/mega — start a mega.nz folder scrape job in the background.
 *
 * Body: `url` and `folderName` are required; `workers` defaults to 3, clamped
 * to 1–10. The URL is validated up front with `parseMegaUrl`, whose error
 * message is returned as a 400. Responds immediately with the new job id.
 */
router.post('/api/scrape/mega', (req, res) => {
  const { url, folderName, workers } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  if (!folderName) return res.status(400).json({ error: 'Folder name is required' });

  try {
    parseMegaUrl(url);
  } catch (err) {
    return res.status(400).json({ error: err.message });
  }

  // Explicit radix so values such as "0x10" are not read as hex.
  const requestedWorkers = Number.parseInt(workers, 10) || 3;
  const config = {
    url,
    folderName,
    workers: Math.min(Math.max(requestedWorkers, 1), 10),
  };

  const job = createJob('mega', config);
  // Safety net for anything the runner throws outside its own try/finally.
  runMegaScrape(job).catch((err) => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
  });

  res.json({ jobId: job.id, message: 'Mega scrape started' });
});
|
|
|
|
/**
 * POST /api/scrape/ytdlp — start a yt-dlp download job in the background.
 *
 * Body: `url` is required; every other field is an optional yt-dlp setting
 * whose default is visible in the `config` object below. The job's
 * `folderName` is derived from the URL (`hostname/path`, capped at 60
 * characters), falling back to the first 60 characters of the raw URL when it
 * cannot be parsed. Responds immediately with the new job id.
 */
router.post('/api/scrape/ytdlp', (req, res) => {
  const {
    url, quality, customFormat, embedMetadata, embedThumbnail, embedSubs,
    writeSubs, subLangs, restrictFilenames, outputTemplate,
    playlist, maxDownloads, concurrentFragments, rateLimit,
    sponsorBlock, cookiesFile,
  } = req.body;
  // A non-string url would otherwise throw inside the folder-name fallback.
  if (!url || typeof url !== 'string') return res.status(400).json({ error: 'URL is required' });

  const deriveFolderName = () => {
    try {
      const parsed = new URL(url);
      const path = parsed.pathname.replace(/^\//, '').replace(/\/$/, '');
      return path ? `${parsed.hostname}/${path}`.slice(0, 60) : parsed.hostname;
    } catch {
      return url.slice(0, 60);
    }
  };

  // Explicit radix so values such as "0x10" are not read as hex.
  const requestedFragments = Number.parseInt(concurrentFragments, 10) || 4;
  const config = {
    url,
    quality: quality || 'best',
    customFormat: customFormat || '',
    // These flags are on unless the client sends an explicit `false`.
    embedMetadata: embedMetadata !== false,
    embedThumbnail: embedThumbnail !== false,
    embedSubs: embedSubs !== false,
    writeSubs: writeSubs || false,
    subLangs: subLangs || 'en',
    restrictFilenames: restrictFilenames !== false,
    outputTemplate: outputTemplate || '%(title)s.%(ext)s',
    playlist: playlist || false,
    maxDownloads: Number.parseInt(maxDownloads, 10) || 0,
    concurrentFragments: Math.min(Math.max(requestedFragments, 1), 16),
    rateLimit: rateLimit || '',
    sponsorBlock: sponsorBlock || 'off',
    cookiesFile: cookiesFile || '',
    folderName: deriveFolderName(),
  };

  const job = createJob('ytdlp', config);
  // Safety net for anything the runner throws outside its own try/finally.
  runYtdlpScrape(job).catch((err) => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
  });

  res.json({ jobId: job.id, message: 'yt-dlp download started' });
});
|
|
|
|
/**
 * POST /api/scrape/leakgallery — start a leakgallery scrape job in the background.
 *
 * Body: `url` and `folderName` are required; `pages` defaults to 100,
 * `workers` defaults to 3 (clamped to 1–10) and `delay` defaults to 300 (ms).
 * The URL is validated up front with `parseLeakGalleryUrl`, whose error
 * message is returned as a 400. Responds immediately with the new job id.
 */
router.post('/api/scrape/leakgallery', (req, res) => {
  const { url, folderName, pages, workers, delay } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });
  if (!folderName) return res.status(400).json({ error: 'Folder name is required' });

  try {
    parseLeakGalleryUrl(url);
  } catch (err) {
    return res.status(400).json({ error: err.message });
  }

  // Explicit radix so values such as "0x10" are not read as hex.
  const requestedWorkers = Number.parseInt(workers, 10) || 3;
  const config = {
    url,
    folderName,
    pages: Number.parseInt(pages, 10) || 100,
    workers: Math.min(Math.max(requestedWorkers, 1), 10),
    delay: Number.parseInt(delay, 10) || 300,
  };

  const job = createJob('leakgallery', config);
  // Safety net for anything the runner throws outside its own try/finally.
  runLeakGalleryScrape(job).catch((err) => {
    addLog(job, `Fatal error: ${err.message}`);
    job.running = false;
    job.completedAt = new Date().toISOString();
  });

  res.json({ jobId: job.id, message: 'LeakGallery scrape started' });
});
|
|
|
|
/**
 * GET /api/scrape/jobs — list every known job (without log lines),
 * most recently started first.
 */
router.get('/api/scrape/jobs', (_req, res) => {
  const summaries = Array.from(jobsMap.values(), (job) => jobToJson(job));
  summaries.sort((x, y) => new Date(y.startedAt) - new Date(x.startedAt));
  res.json(summaries);
});
|
|
|
|
/**
 * GET /api/scrape/jobs/:jobId — return one job's summary plus its log lines,
 * or 404 when the id is unknown.
 */
router.get('/api/scrape/jobs/:jobId', (req, res) => {
  const { jobId } = req.params;
  const job = jobsMap.get(jobId);
  if (!job) {
    return res.status(404).json({ error: 'Job not found' });
  }

  const payload = { ...jobToJson(job), logs: job.logs };
  res.json(payload);
});
|
|
|
|
/**
 * POST /api/scrape/jobs/:jobId/cancel — flag a running job as cancelled.
 *
 * Only sets `job.cancelled`; the runner is expected to notice the flag.
 * Responds 404 for an unknown id and 400 when the job is not running.
 */
router.post('/api/scrape/jobs/:jobId/cancel', (req, res) => {
  const { jobId } = req.params;
  const job = jobsMap.get(jobId);

  if (!job) {
    return res.status(404).json({ error: 'Job not found' });
  }
  if (!job.running) {
    return res.status(400).json({ error: 'Job is not running' });
  }

  job.cancelled = true;
  addLog(job, 'Cancel requested');
  res.json({ message: 'Cancel requested' });
});
|
|
|
|
/**
 * DELETE /api/scrape/jobs/:jobId — remove a job from the registry.
 *
 * The record is flagged cancelled and not-running before removal, so a runner
 * still holding a reference sees the cancel flag. Responds 404 for an
 * unknown id.
 */
router.delete('/api/scrape/jobs/:jobId', (req, res) => {
  const { jobId } = req.params;
  const job = jobsMap.get(jobId);
  if (!job) {
    return res.status(404).json({ error: 'Job not found' });
  }

  Object.assign(job, { cancelled: true, running: false });
  jobsMap.delete(jobId);

  res.json({ message: 'Job removed' });
});
|
|
|
|
// Auto-detect max page for forum URLs
|
|
/**
 * POST /api/scrape/forum/detect-pages — detect the last page of a forum thread.
 *
 * Body: `url` (required) and optional `cookies`. Responds with
 * `{ maxPage, logs }`, where `logs` are the messages `detectMaxPage` emitted.
 * A failure inside `detectMaxPage` is answered with a 500 (including the logs
 * gathered so far) instead of leaving the request hanging on an unhandled
 * rejection.
 */
router.post('/api/scrape/forum/detect-pages', async (req, res) => {
  const { url, cookies } = req.body;
  if (!url) return res.status(400).json({ error: 'URL is required' });

  const logs = [];
  try {
    const maxPage = await detectMaxPage(url, (msg) => logs.push(msg), cookies);
    res.json({ maxPage, logs });
  } catch (err) {
    console.error('[scrape/forum/detect-pages]', err);
    res.status(500).json({ error: err.message, logs });
  }
});
|
|
|
|
// Search a forum site for threads matching a query, return preview images per thread
|
|
/**
 * Substrings that disqualify an image URL from being used as a thread preview
 * (matched against the lower-cased absolute URL).
 */
const SEARCH_SKIP_PATTERNS = [
  'avatar',
  'smilie',
  'emoji',
  'icon',
  'logo',
  'button',
  'sprite',
  'badge',
  'rank',
  'star',
];

/** Matches URLs whose path ends in a common image extension (query string allowed). */
const PREVIEW_IMG_EXTS = /\.(jpg|jpeg|png|webp|gif)(\?|$)/i;
|
|
|
|
/**
 * Coerce a client-supplied count to a non-negative integer.
 * Unparsable input yields `fallback`; negative input is clamped to 0.
 */
function toNonNegativeCount(value, fallback) {
  const parsed = Number.parseInt(value, 10);
  return Number.isNaN(parsed) ? fallback : Math.max(0, parsed);
}

/**
 * Extract unique thread roots and titles from a XenForo search results page
 * (anchors inside `h3.contentRow-title` that point at `/threads/...`).
 */
function parseSearchThreads(html, baseUrl) {
  const $ = cheerio.load(html);
  const seen = new Set();
  const threads = [];

  $('h3.contentRow-title a[href*="/threads/"]').each((_, el) => {
    const $a = $(el);
    let href;
    try { href = new URL($a.attr('href'), baseUrl).href; } catch { return; }
    const m = href.match(/\/threads\/([^\/]+\.\d+)\//);
    if (!m) return;
    const threadRoot = `${baseUrl}/threads/${m[1]}/`;
    if (seen.has(threadRoot)) return;
    seen.add(threadRoot);
    const title = $a.text().replace(/\s+/g, ' ').trim();
    if (!title || title.length < 3) return;
    threads.push({ threadUrl: threadRoot, title });
  });

  return threads;
}

/**
 * Collect de-duplicated preview image URLs from a thread page, in document
 * order. A wrapping link that points at an image wins over the `<img>` source;
 * otherwise `.th.` / `.md.` thumbnail markers are stripped from the source.
 */
function collectPreviewImageUrls(html, pageUrl) {
  const $p = cheerio.load(html);
  const imgUrls = [];

  $p('.message-body img, .bbWrapper img').each((_, el) => {
    const $img = $p(el);
    const src = $img.attr('src') || $img.attr('data-src') || $img.attr('data-url');
    if (!src) return;

    let absSrc;
    try { absSrc = new URL(src, pageUrl).href; } catch { return; }
    const lower = absSrc.toLowerCase();
    if (SEARCH_SKIP_PATTERNS.some((p) => lower.includes(p))) return;

    const $parentA = $img.closest('a');
    if ($parentA.length && $parentA.attr('href')) {
      try {
        const aHref = new URL($parentA.attr('href'), pageUrl).href;
        if (PREVIEW_IMG_EXTS.test(aHref)) { imgUrls.push(aHref); return; }
      } catch {
        // Unparsable link target: fall through and use the <img> source instead.
      }
    }

    const upgraded = absSrc.replace('.th.', '.').replace('.md.', '.');
    if (PREVIEW_IMG_EXTS.test(upgraded) || /\/data\/attachments|proxy\.php/.test(upgraded)) {
      imgUrls.push(upgraded);
    }
  });

  return [...new Set(imgUrls)];
}

/**
 * POST /api/scrape/forum/search — search a stored forum site for threads and
 * return preview image URLs taken from each thread's last page.
 *
 * Body: `query` (required), `siteId` (default 2), `maxThreads` (default 5),
 * `previewsPerThread` (default 4), `titleOnly` (default true). The two counts
 * are coerced to non-negative integers; a count of 0 yields no threads /
 * no previews (previously `slice(-0)` returned every image).
 *
 * All page fetches go through a single FlareSolverr session that is destroyed
 * when the handler finishes; a cleanup failure is logged, never thrown.
 */
router.post('/api/scrape/forum/search', async (req, res) => {
  const { query, siteId = 2, maxThreads = 5, previewsPerThread = 4, titleOnly = true } = req.body;
  if (!query) return res.status(400).json({ error: 'query is required' });

  const threadLimit = toNonNegativeCount(maxThreads, 5);
  const previewLimit = toNonNegativeCount(previewsPerThread, 4);

  const site = getForumSiteById(siteId);
  if (!site) return res.status(404).json({ error: `Forum site ${siteId} not found` });
  if (!site.cookies) return res.status(400).json({ error: 'Forum site has no cookies — refresh first' });

  const baseUrl = (site.base_url || 'https://simpcity.cr').replace(/\/$/, '');
  const cookies = site.cookies;

  let fsSession = null;
  try {
    fsSession = await fsCreateSession(baseUrl + '/');

    // Step 1: GET search form to grab the XenForo CSRF token
    const formRes = await fsGet(fsSession, baseUrl + '/search/', cookies);
    if (formRes.status !== 200) {
      // Only echo the upstream status when it is a real HTTP error code;
      // anything else would be an invalid or misleading status for an error body.
      const upstream = formRes.status;
      const code = Number.isInteger(upstream) && upstream >= 400 && upstream <= 599 ? upstream : 502;
      return res.status(code).json({ error: `Search form fetch failed: HTTP ${formRes.status}` });
    }
    const xfMatch = formRes.html.match(/name="_xfToken"\s+value="([^"]+)"/);
    if (!xfMatch) {
      return res.status(503).json({ error: 'No _xfToken on search form — cookies likely expired. Refresh via /api/flaresolverr/refresh/' + siteId });
    }
    const xfToken = xfMatch[1];

    // Step 2: POST the search; XenForo redirects to /search/<id>/ with results
    const postBody = new URLSearchParams({
      keywords: query,
      'c[title_only]': titleOnly ? '1' : '',
      'c[users]': '',
      _xfToken: xfToken,
    });
    const postRes = await fsPost(fsSession, baseUrl + '/search/search', cookies, postBody.toString());

    // Parse thread results from contentRow-title anchors (XenForo result layout)
    const threads = parseSearchThreads(postRes.html, baseUrl);
    if (threads.length === 0) {
      return res.json({ query, results: [] });
    }

    // For top N threads, fetch last page and pull preview image URLs
    const results = [];
    for (const t of threads.slice(0, threadLimit)) {
      try {
        const maxPage = await detectMaxPage(t.threadUrl, () => {}, cookies, '', fsSession);
        const lastPageUrl = maxPage && maxPage > 1 ? `${t.threadUrl}page-${maxPage}` : t.threadUrl;
        const pageRes = await fsGet(fsSession, lastPageUrl, cookies);
        if (pageRes.status !== 200) {
          results.push({ ...t, lastPageUrl, lastPageNum: maxPage || 1, previews: [], error: `HTTP ${pageRes.status}` });
          continue;
        }

        const unique = collectPreviewImageUrls(pageRes.html, lastPageUrl);
        // Keep the LAST `previewLimit` images; guard 0 because slice(-0) is slice(0).
        const previews = previewLimit > 0 ? unique.slice(-previewLimit) : [];
        results.push({ ...t, lastPageUrl, lastPageNum: maxPage || 1, previews });
      } catch (err) {
        // One failing thread must not abort the whole search.
        results.push({ ...t, previews: [], error: err.message });
      }
    }

    res.json({ query, results });
  } catch (err) {
    console.error('[scrape/forum/search]', err);
    if (!res.headersSent) res.status(500).json({ error: err.message });
  } finally {
    if (fsSession) {
      // The response has usually been sent by now; a cleanup failure must not
      // surface as an unhandled rejection.
      try {
        await fsDestroySession(fsSession);
      } catch (closeErr) {
        console.error('[scrape/forum/search] session cleanup failed', closeErr);
      }
    }
  }
});
|
|
|
|
// --- Forum Sites CRUD ---
|
|
|
|
/**
 * GET /api/scrape/forum-sites — return whatever `getForumSites()` yields.
 * NOTE(review): records are passed through unfiltered; confirm stored
 * cookies/passwords are not meant to be hidden from this response.
 */
router.get('/api/scrape/forum-sites', (_req, res) => {
  const sites = getForumSites();
  res.json(sites);
});
|
|
|
|
/**
 * POST /api/scrape/forum-sites — create a forum site record.
 *
 * Body: `name` (required), plus optional `baseUrl`, `cookies`, `username`
 * and `password`. Responds with the stored record looked up by its new id.
 */
router.post('/api/scrape/forum-sites', (req, res) => {
  const { name, baseUrl, cookies, username, password } = req.body;
  if (!name) {
    return res.status(400).json({ error: 'Name is required' });
  }

  const newId = createForumSite(name, baseUrl, cookies, username, password);
  const created = getForumSiteById(newId);
  res.json(created);
});
|
|
|
|
/**
 * PUT /api/scrape/forum-sites/:id — partially update a forum site record.
 *
 * Only body keys that are present (not `undefined`) are forwarded to
 * `updateForumSite`; `baseUrl` is stored under the `base_url` column.
 * Responds 404 for an unknown id, otherwise with the updated record.
 */
router.put('/api/scrape/forum-sites/:id', (req, res) => {
  const id = parseInt(req.params.id, 10);
  if (!getForumSiteById(id)) {
    return res.status(404).json({ error: 'Forum site not found' });
  }

  // Request-body key -> column name accepted by updateForumSite().
  const columnByBodyKey = [
    ['name', 'name'],
    ['baseUrl', 'base_url'],
    ['cookies', 'cookies'],
    ['username', 'username'],
    ['password', 'password'],
  ];

  const { body } = req;
  const fields = {};
  for (const [bodyKey, column] of columnByBodyKey) {
    const value = body[bodyKey];
    if (value !== undefined) fields[column] = value;
  }

  updateForumSite(id, fields);
  res.json(getForumSiteById(id));
});
|
|
|
|
/**
 * DELETE /api/scrape/forum-sites/:id — delete a forum site record.
 * Always responds `{ ok: true }`; no existence check is performed here.
 */
router.delete('/api/scrape/forum-sites/:id', (req, res) => {
  const siteId = parseInt(req.params.id, 10);
  deleteForumSite(siteId);
  res.json({ ok: true });
});
|
|
|
|
// --- Auto-scrape CRUD ---
|
|
|
|
/** GET /api/scrape/auto — return whatever `getAutoScrapeJobs()` yields. */
router.get('/api/scrape/auto', (_req, res) => {
  const autoJobs = getAutoScrapeJobs();
  res.json(autoJobs);
});
|
|
|
|
/**
 * POST /api/scrape/auto — register an auto-scrape entry.
 *
 * Body: `type`, `url`, `folderName` and `config` are all required (any falsy
 * value is rejected with a 400). Responds `{ ok: true }` on success.
 */
router.post('/api/scrape/auto', (req, res) => {
  const { type, url, folderName, config } = req.body;

  const hasMissingField = [type, url, folderName, config].some((value) => !value);
  if (hasMissingField) {
    return res.status(400).json({ error: 'type, url, folderName, and config are required' });
  }

  addAutoScrapeJob(type, url, folderName, config);
  res.json({ ok: true });
});
|
|
|
|
/**
 * DELETE /api/scrape/auto/:id — remove an auto-scrape entry.
 * Always responds `{ ok: true }`; no existence check is performed here.
 */
router.delete('/api/scrape/auto/:id', (req, res) => {
  // Explicit radix, matching the other id-parsing handlers in this file.
  const id = Number.parseInt(req.params.id, 10);
  removeAutoScrapeJob(id);
  res.json({ ok: true });
});
|
|
|
|
/**
 * Count the jobs in `jobsMap` whose `running` flag is truthy.
 *
 * @returns {number} Number of currently running scrape jobs.
 */
export function getActiveScrapeCount() {
  return [...jobsMap.values()].reduce(
    (active, job) => (job.running ? active + 1 : active),
    0,
  );
}
|
|
|
|
/**
 * Summarize the running jobs in `jobsMap`, in registry order.
 *
 * @returns {Array<{type: *, folderName: *, progress: *}>} One entry per job
 *   whose `running` flag is truthy; `progress` is the job's own object, not a copy.
 */
export function getActiveScrapesList() {
  return [...jobsMap.values()]
    .filter((job) => job.running)
    .map(({ type, folderName, progress }) => ({ type, folderName, progress }));
}
|
|
|
|
export { runForumScrape, runCoomerScrape, runMediaLinkScrape, runMegaScrape, runYtdlpScrape, runLeakGalleryScrape, createJob };
|
|
export default router;
|