Route SimpCity forum scraping through FlareSolverr + add turbo.cr resolver

DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so
direct Node fetch returns 403 even with valid cookies. Page HTML for any
forum_site with stored cookies is now fetched via a FlareSolverr browser
session opened once per scrape job.

- Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those
  cookies seed undetected_chromedriver, Turnstile auto-solves in the real
  browser, login form submits, final cookies + browser UA persist to forum_sites
- Per-site user_agent column so subsequent scraper requests match the UA the
  cookies were issued for (DDoS-Guard rejects UA mismatches)
- XenForo search rewritten as proper CSRF POST /search/search → results page
  parse, replacing the broken ?q=... GET that only returned the search form
- Pagination regex fallback in detectMaxPage catches XenForo pages that
  cheerio's class-based selectors miss
- New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering
  the page via FlareSolverr and grabbing the signed mp4 from the resolved
  <video src> attribute (gallery-dl can't extract these — obfuscated WASM)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-04-29 19:33:54 -05:00
parent 236f36aae6
commit aa4f1157d1
6 changed files with 589 additions and 78 deletions
+144 -6
View File
@@ -1,8 +1,9 @@
import { Router } from 'express';
import { mkdirSync } from 'fs';
import { join } from 'path';
import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError } from './scrapers/forum.js';
import { refreshForumCookies } from './flaresolverr.js';
import * as cheerio from 'cheerio';
import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
import { refreshForumCookies, fsCreateSession, fsDestroySession, fsGet, fsPost } from './flaresolverr.js';
import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js';
import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js';
import { parseMegaUrl, listAllFiles, downloadMegaFiles } from './scrapers/mega.js';
@@ -75,14 +76,17 @@ function jobToJson(job) {
async function runForumScrape(job) {
let { url, startPage, endPage, delay, folderName, siteId, lastPageOnly } = job.config;
let { cookies } = job.config;
let userAgent = job.config.userAgent || '';
// Load cookies from forum site record if siteId provided and no cookies passed
if (!cookies && siteId) {
const site = getForumSiteById(siteId);
if (site && site.cookies) {
cookies = site.cookies;
userAgent = site.user_agent || userAgent;
job.config.cookies = cookies;
addLog(job, `Loaded cookies from forum site: ${site.name}`);
job.config.userAgent = userAgent;
addLog(job, `Loaded cookies from forum site: ${site.name}${userAgent ? ` (UA pinned)` : ''}`);
}
}
@@ -92,10 +96,24 @@ async function runForumScrape(job) {
const downloadedSet = new Set();
let totalImages = 0;
// When a siteId is in play, page HTML must be fetched through FlareSolverr —
// direct fetch hits DDoS-Guard's browser-fingerprint check and gets 403.
let fsSession = null;
if (siteId && cookies) {
try {
const baseHost = new URL(url).origin;
addLog(job, `Opening FlareSolverr session for ${baseHost}...`);
fsSession = await fsCreateSession(baseHost + '/');
addLog(job, `FlareSolverr session ready (${fsSession.slice(0, 8)}...)`);
} catch (e) {
addLog(job, `FlareSolverr session failed (${e.message}) — falling back to direct fetch`);
}
}
// If lastPageOnly, detect the last page and only scrape that
if (lastPageOnly) {
addLog(job, 'Detecting last page...');
const maxPage = await detectMaxPage(url, (msg) => addLog(job, msg), cookies);
const maxPage = await detectMaxPage(url, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
if (maxPage) {
startPage = maxPage;
endPage = maxPage;
@@ -122,15 +140,18 @@ async function runForumScrape(job) {
let count;
try {
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies);
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
} catch (err) {
if (err instanceof CookieExpiredError && siteId) {
addLog(job, `Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`);
try {
cookies = await refreshForumCookies(siteId);
const refreshed = getForumSiteById(siteId);
userAgent = refreshed?.user_agent || userAgent;
job.config.cookies = cookies;
job.config.userAgent = userAgent;
addLog(job, 'Cookies refreshed successfully — retrying page...');
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies);
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
} catch (refreshErr) {
addLog(job, `Cookie refresh failed: ${refreshErr.message}`);
addLog(job, 'Stopping scrape — fix credentials or refresh cookies manually');
@@ -156,6 +177,10 @@ async function runForumScrape(job) {
addLog(job, `Error: ${err.message}`);
job.progress.errors++;
} finally {
if (fsSession) {
await fsDestroySession(fsSession);
addLog(job, `FlareSolverr session closed`);
}
job.running = false;
job.completedAt = new Date().toISOString();
addLog(job, `Done! ${totalImages} files saved to ${folderName}/`);
@@ -653,6 +678,119 @@ router.post('/api/scrape/forum/detect-pages', async (req, res) => {
res.json({ maxPage, logs });
});
// Search a forum site for threads matching a query, return preview images per thread

// Substrings that mark an image URL as forum chrome (avatars, smilies, ...) rather than post content.
const SEARCH_SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star'];
// Image file extension, anchored to the end of the URL or the start of its query string.
const PREVIEW_IMG_EXTS = /\.(jpg|jpeg|png|webp|gif)(\?|$)/i;

/**
 * Coerce a client-supplied count into a non-negative integer.
 *
 * @param {unknown} value - Raw value from the request body.
 * @param {number} fallback - Returned when `value` is missing, non-numeric or negative.
 * @returns {number} A finite integer >= 0.
 */
function toSearchCount(value, fallback) {
  const n = Math.floor(Number(value));
  return Number.isFinite(n) && n >= 0 ? n : fallback;
}

/**
 * POST /api/scrape/forum/search
 *
 * Request body: `{ query, siteId = 2, maxThreads = 5, previewsPerThread = 4, titleOnly = true }`.
 * Responds with `{ query, results }`, where each result is
 * `{ threadUrl, title, lastPageUrl, lastPageNum, previews }` and carries an
 * `error` string when that thread's last page could not be fetched.
 *
 * Every page is fetched through one FlareSolverr session that is opened for
 * this request and destroyed in the `finally` block.
 */
router.post('/api/scrape/forum/search', async (req, res) => {
  // `req.body` is undefined when no body parser matched; fall back to {} so
  // destructuring cannot throw outside the try block of this async handler.
  const body = req.body ?? {};
  const { query, siteId = 2, titleOnly = true } = body;
  const maxThreads = toSearchCount(body.maxThreads, 5);
  const previewsPerThread = toSearchCount(body.previewsPerThread, 4);

  if (typeof query !== 'string' || !query.trim()) {
    return res.status(400).json({ error: 'query is required' });
  }

  let fsSession = null;
  try {
    const site = getForumSiteById(siteId);
    if (!site) return res.status(404).json({ error: `Forum site ${siteId} not found` });
    if (!site.cookies) return res.status(400).json({ error: 'Forum site has no cookies — refresh first' });

    const baseUrl = (site.base_url || 'https://simpcity.cr').replace(/\/$/, '');
    const cookies = site.cookies;
    // Forward the stored UA the same way runForumScrape does.
    // NOTE(review): assumes detectMaxPage accepts the UA as its 4th argument, as at the other call sites — confirm.
    const userAgent = site.user_agent || '';

    fsSession = await fsCreateSession(`${baseUrl}/`);

    // Step 1: GET search form to grab the XenForo CSRF token
    const formRes = await fsGet(fsSession, `${baseUrl}/search/`, cookies);
    if (formRes.status !== 200) {
      return res.status(formRes.status).json({ error: `Search form fetch failed: HTTP ${formRes.status}` });
    }
    const xfMatch = formRes.html.match(/name="_xfToken"\s+value="([^"]+)"/);
    if (!xfMatch) {
      return res.status(503).json({ error: `No _xfToken on search form — cookies likely expired. Refresh via /api/flaresolverr/refresh/${siteId}` });
    }
    const xfToken = xfMatch[1];

    // Step 2: POST the search; XenForo redirects to /search/<id>/ with results
    const postBody = new URLSearchParams({
      keywords: query,
      'c[title_only]': titleOnly ? '1' : '',
      'c[users]': '',
      _xfToken: xfToken,
    });
    const postRes = await fsPost(fsSession, `${baseUrl}/search/search`, cookies, postBody.toString());
    // Surface upstream failures instead of reporting them as "no results".
    if (postRes.status >= 400) {
      return res.status(postRes.status).json({ error: `Search request failed: HTTP ${postRes.status}` });
    }

    // Parse thread results from contentRow-title anchors (XenForo result layout)
    const $ = cheerio.load(postRes.html);
    const seen = new Set();
    const threads = [];
    $('h3.contentRow-title a[href*="/threads/"]').each((_, el) => {
      const $a = $(el);
      let href;
      try {
        href = new URL($a.attr('href'), baseUrl).href;
      } catch {
        return; // unparseable href — skip this anchor
      }
      const m = href.match(/\/threads\/([^\/]+\.\d+)\//);
      if (!m) return;
      // Collapse post/page links onto the thread root so each thread is listed once.
      const threadRoot = `${baseUrl}/threads/${m[1]}/`;
      if (seen.has(threadRoot)) return;
      seen.add(threadRoot);
      const title = $a.text().replace(/\s+/g, ' ').trim();
      if (!title || title.length < 3) return;
      threads.push({ threadUrl: threadRoot, title });
    });

    if (threads.length === 0) {
      return res.json({ query, results: [] });
    }

    // For top N threads, fetch last page and pull preview image URLs
    const results = [];
    for (const t of threads.slice(0, maxThreads)) {
      try {
        const maxPage = await detectMaxPage(t.threadUrl, () => {}, cookies, userAgent, fsSession);
        const lastPageUrl = maxPage && maxPage > 1 ? `${t.threadUrl}page-${maxPage}` : t.threadUrl;
        const pageRes = await fsGet(fsSession, lastPageUrl, cookies);
        if (pageRes.status !== 200) {
          results.push({ ...t, lastPageUrl, lastPageNum: maxPage || 1, previews: [], error: `HTTP ${pageRes.status}` });
          continue;
        }

        const $p = cheerio.load(pageRes.html);
        const imgUrls = [];
        $p('.message-body img, .bbWrapper img').each((_, el) => {
          const $img = $p(el);
          const src = $img.attr('src') || $img.attr('data-src') || $img.attr('data-url');
          if (!src) return;
          let absSrc;
          try {
            absSrc = new URL(src, lastPageUrl).href;
          } catch {
            return; // unparseable image URL — skip it
          }
          const lower = absSrc.toLowerCase();
          if (SEARCH_SKIP_PATTERNS.some((p) => lower.includes(p))) return;

          // Prefer the wrapping link when it points straight at an image file.
          const $parentA = $img.closest('a');
          if ($parentA.length && $parentA.attr('href')) {
            try {
              const aHref = new URL($parentA.attr('href'), lastPageUrl).href;
              if (PREVIEW_IMG_EXTS.test(aHref)) {
                imgUrls.push(aHref);
                return;
              }
            } catch {
              // Best-effort: an unparseable link href falls through to the <img> source below.
            }
          }

          // Strip the ".th." / ".md." size markers from the image URL.
          const upgraded = absSrc.replace('.th.', '.').replace('.md.', '.');
          if (PREVIEW_IMG_EXTS.test(upgraded) || /\/data\/attachments|proxy\.php/.test(upgraded)) {
            imgUrls.push(upgraded);
          }
        });

        const unique = [...new Set(imgUrls)];
        // slice(-0) is slice(0) and would return every image, so a limit of 0 is handled explicitly.
        const previews = previewsPerThread > 0 ? unique.slice(-previewsPerThread) : [];
        results.push({ ...t, lastPageUrl, lastPageNum: maxPage || 1, previews });
      } catch (err) {
        // One failing thread must not abort the whole search.
        results.push({ ...t, previews: [], error: err.message });
      }
    }

    res.json({ query, results });
  } catch (err) {
    console.error('[scrape/forum/search]', err);
    res.status(500).json({ error: err.message });
  } finally {
    if (fsSession) {
      try {
        await fsDestroySession(fsSession);
      } catch (closeErr) {
        // The response has already been sent; log instead of rejecting the handler.
        console.error('[scrape/forum/search] failed to close FlareSolverr session', closeErr);
      }
    }
  }
});
// --- Forum Sites CRUD ---
router.get('/api/scrape/forum-sites', (_req, res) => {