Route SimpCity forum scraping through FlareSolverr + add turbo.cr resolver

DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so
direct Node fetch returns 403 even with valid cookies. Page HTML for any
forum_site with stored cookies is now fetched via a FlareSolverr browser
session opened once per scrape job.

- Hybrid cookie refresh: FlareSolverr first clears the DDoS-Guard captcha and
  its cookies seed undetected_chromedriver; Turnstile then auto-solves in the
  real browser, the login form is submitted, and the final cookies + browser UA
  are persisted to forum_sites
- Per-site user_agent column so subsequent scraper requests match the UA the
  cookies were issued for (DDoS-Guard rejects UA mismatches)
- XenForo search rewritten as proper CSRF POST /search/search → results page
  parse, replacing the broken ?q=... GET that only returned the search form
- Pagination regex fallback in detectMaxPage catches XenForo pages that
  cheerio's class-based selectors miss
- New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering
  the page via FlareSolverr and grabbing the signed mp4 from the resolved
  <video src> attribute (gallery-dl can't extract these — obfuscated WASM)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-04-29 19:33:54 -05:00
parent 236f36aae6
commit aa4f1157d1
6 changed files with 589 additions and 78 deletions
+64 -25
View File
@@ -5,6 +5,8 @@ import { pipeline } from 'stream/promises';
import { execFile } from 'child_process';
import { promisify } from 'util';
import { upsertMediaFile } from '../db.js';
import { fsGet } from '../flaresolverr.js';
import { isTurboUrl, downloadTurbo } from './turbo.js';
const execFileAsync = promisify(execFile);
@@ -20,11 +22,13 @@ export class CookieExpiredError extends Error {
}
// Replace DDoS-Guard __ddg9_ cookie IP with server's IP so cookies work from any browser
function fixCookieIp(cookies) {
export function fixCookieIp(cookies) {
if (!cookies) return cookies;
return cookies.replace(/__ddg9_=[^;]+/, `__ddg9_=${SERVER_IP}`);
}
export const FORUM_UA = UA;
const IMAGE_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']);
const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.webm', '.mkv', '.m4v', '.wmv', '.flv', '.ts']);
const SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star', 'dc_thumbnails'];
@@ -69,13 +73,20 @@ export function getPageUrl(baseUrl, pageNum) {
return url.split('#')[0];
}
export async function detectMaxPage(baseUrl, logFn, cookies) {
export async function detectMaxPage(baseUrl, logFn, cookies, userAgent, fsSession) {
try {
const headers = { 'User-Agent': UA };
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
const resp = await fetch(baseUrl, { headers, signal: AbortSignal.timeout(15000) });
if (!resp.ok) return null;
const html = await resp.text();
let html;
if (fsSession) {
const r = await fsGet(fsSession, baseUrl, cookies);
if (r.status !== 200) return null;
html = r.html;
} else {
const headers = { 'User-Agent': userAgent || UA };
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
const resp = await fetch(baseUrl, { headers, signal: AbortSignal.timeout(15000) });
if (!resp.ok) return null;
html = await resp.text();
}
const $ = cheerio.load(html);
let maxPage = 1;
@@ -91,6 +102,17 @@ export async function detectMaxPage(baseUrl, logFn, cookies) {
if (n > maxPage && n < 10000) maxPage = n;
}
});
// Final fallback: scan raw HTML for any page-N references (XenForo's
// serialized pagination sometimes only appears in href attributes that
// cheerio's class-based selectors miss).
if (maxPage === 1) {
const re = /page-(\d+)/g;
let m;
while ((m = re.exec(html)) !== null) {
const n = parseInt(m[1], 10);
if (n > maxPage && n < 10000) maxPage = n;
}
}
if (maxPage > 1) {
logFn(`Detected ${maxPage} pages`);
@@ -123,7 +145,7 @@ function tryFullSizeUrl(thumbUrl) {
return candidates;
}
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, userAgent) {
if (downloadedSet.has(url)) return false;
if (!isImageUrl(url)) return false;
const lower = url.toLowerCase();
@@ -142,7 +164,7 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
}
try {
const dlHeaders = { 'User-Agent': UA };
const dlHeaders = { 'User-Agent': userAgent || UA };
if (cookies) dlHeaders['Cookie'] = fixCookieIp(cookies);
const resp = await fetch(url, { headers: dlHeaders, signal: AbortSignal.timeout(30000) });
if (!resp.ok) {
@@ -171,10 +193,16 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
}
// Use gallery-dl to download from external hosts (bunkr, saint, cyberdrop, etc.)
async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn) {
async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn, userAgent, fsSession) {
if (downloadedSet.has(url)) return 0;
downloadedSet.add(url);
// turbo.cr uses an obfuscated WASM player — gallery-dl can't extract the
// signed mp4 URL. Resolve via FlareSolverr (renders JS) instead.
if (isTurboUrl(url)) {
return await downloadTurbo(url, outputDir, logFn, userAgent, fsSession);
}
logFn(`Resolving via gallery-dl: ${url}`);
try {
@@ -240,23 +268,34 @@ async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn) {
}
}
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies) {
logFn(`Fetching page: ${pageUrl}`);
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies, userAgent, fsSession) {
logFn(`Fetching page: ${pageUrl}${fsSession ? ' [via FlareSolverr]' : ''}`);
let html;
try {
const headers = { 'User-Agent': UA };
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
const resp = await fetch(pageUrl, { headers, signal: AbortSignal.timeout(15000) });
if (!resp.ok) {
// SimpCity returns 404 for expired sessions, 403 for blocked
if (cookies && (resp.status === 404 || resp.status === 403)) {
throw new CookieExpiredError(resp.status);
if (fsSession) {
const r = await fsGet(fsSession, pageUrl, cookies);
if (r.status !== 200) {
if (cookies && (r.status === 404 || r.status === 403)) {
throw new CookieExpiredError(r.status);
}
logFn(`Failed to fetch page (${r.status})`);
return 0;
}
logFn(`Failed to fetch page (${resp.status})`);
return 0;
html = r.html;
} else {
const headers = { 'User-Agent': userAgent || UA };
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
const resp = await fetch(pageUrl, { headers, signal: AbortSignal.timeout(15000) });
if (!resp.ok) {
if (cookies && (resp.status === 404 || resp.status === 403)) {
throw new CookieExpiredError(resp.status);
}
logFn(`Failed to fetch page (${resp.status})`);
return 0;
}
html = await resp.text();
}
html = await resp.text();
} catch (err) {
if (err instanceof CookieExpiredError) throw err;
logFn(`Failed to fetch page: ${err.message}`);
@@ -359,14 +398,14 @@ export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn,
// Download images
for (const imgUrl of imageUrls) {
if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies)) {
if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies, userAgent)) {
count++;
}
}
// Download from external hosts via gallery-dl
// Download from external hosts (turbo.cr handled via FlareSolverr; rest via gallery-dl)
for (const extUrl of externalUrls) {
const dlCount = await downloadFromExternalHost(extUrl, outputDir, downloadedSet, logFn);
const dlCount = await downloadFromExternalHost(extUrl, outputDir, downloadedSet, logFn, userAgent, fsSession);
count += dlCount;
}
+138
View File
@@ -0,0 +1,138 @@
import { createWriteStream, existsSync, writeFileSync } from 'fs';
import { rename, rm, stat } from 'fs/promises';
import { basename, join } from 'path';
import { Readable } from 'stream';
import { pipeline } from 'stream/promises';
import { fsCreateSession, fsDestroySession, fsGet } from '../flaresolverr.js';
import { upsertMediaFile } from '../db.js';
// Matches http(s) URLs whose host is "turbo.<tld>" (optionally prefixed with
// "www."), case-insensitively. The TLD is any run of word characters, so the
// pattern is not restricted to turbo.cr.
const TURBO_HOST_RE = /^https?:\/\/(?:www\.)?turbo\.\w+\//i;
// Canonical origin: used to open a FlareSolverr session, to build
// /embed/<id> URLs, and as the Referer on video downloads.
const TURBO_BASE = 'https://turbo.cr';
// User-Agent sent on video downloads when the caller does not supply one.
const DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
/**
 * Tell whether a URL belongs to a turbo host.
 *
 * @param {string} url - Absolute URL to classify.
 * @returns {boolean} true when `url` matches TURBO_HOST_RE (http/https,
 *   optional "www.", host "turbo.<tld>", followed by a path slash).
 */
export function isTurboUrl(url) {
  const matchesTurboHost = TURBO_HOST_RE.test(url);
  return matchesTurboHost;
}
/**
 * Decode the HTML entities that can appear inside an attribute value
 * (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`).
 *
 * Decoding happens in a single pass so that already-decoded output is never
 * re-scanned: `&amp;lt;` yields the literal text `&lt;`, not `<`. Chaining
 * one `.replace()` per entity with `&amp;` first would double-decode it.
 *
 * @param {string} s - Raw attribute text taken from HTML.
 * @returns {string} The text with the supported entities decoded.
 */
function unescapeHtml(s) {
  const entities = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
  };
  return s.replace(/&(?:amp|lt|gt|quot|#39);/g, (entity) => entities[entity]);
}
/**
 * Pull the first mp4 URL out of a rendered embed page.
 *
 * Two strategies are tried in order:
 *   1. the `src` attribute of a `<video>` tag that contains ".mp4"
 *      (the player is expected to write the resolved URL there once its
 *      scripts have run);
 *   2. any absolute URL in the markup that mentions "turbocdn" and ".mp4".
 *
 * @param {string} html - Page markup.
 * @returns {string|null} The entity-decoded mp4 URL, or null if none is found.
 */
function extractMp4FromHtml(html) {
  const videoSrcPattern = /<video[^>]+\bsrc=["']([^"']+\.mp4[^"']*)["']/i;
  const videoTagHit = html.match(videoSrcPattern);
  if (videoTagHit) {
    return unescapeHtml(videoTagHit[1]);
  }

  const cdnUrlPattern = /https?:\/\/[^"'\s<>]*turbocdn[^"'\s<>]*\.mp4[^"'\s<>]*/i;
  const cdnHit = html.match(cdnUrlPattern);
  return cdnHit ? unescapeHtml(cdnHit[0]) : null;
}
/**
 * Choose the on-disk filename for a resolved mp4 URL.
 *
 * Preference order: the `fn` query parameter, then the last segment of the
 * URL path, then `<fallbackId>.mp4` (or `turbo.mp4` when no id is given).
 *
 * The URL originates from scraped page markup, so every candidate is reduced
 * to its final path component and the special names "." and ".." are
 * rejected. Without this a value such as `fn=../../x` would escape the output
 * directory once the caller joins the name onto it.
 *
 * @param {string} mp4Url - Resolved video URL.
 * @param {string} [fallbackId] - Identifier used when the URL yields no name.
 * @returns {string} A bare filename containing no directory components.
 */
function turboFilename(mp4Url, fallbackId) {
  // Reduce a candidate to a bare, usable filename, or null if nothing is left.
  const toSafeName = (candidate) => {
    if (!candidate) return null;
    const name = basename(candidate);
    return name && name !== '.' && name !== '..' ? name : null;
  };

  try {
    const u = new URL(mp4Url);
    const name = toSafeName(u.searchParams.get('fn')) || toSafeName(u.pathname);
    if (name) return name;
  } catch {
    // mp4Url is not a parseable absolute URL; use the fallback name below.
  }
  return `${fallbackId || 'turbo'}.mp4`;
}
/**
 * Download `url` to `dest`, streaming the response body to disk.
 *
 * The body is piped to `<dest>.part` and renamed to `dest` only after the
 * transfer completes and passes the minimum-size check. Streaming keeps
 * memory use flat regardless of video size, and the temporary name ensures
 * that an interrupted transfer never leaves a truncated file at `dest`.
 *
 * This function never throws; every failure is reported in the result object.
 *
 * @param {string} url - Direct mp4 URL.
 * @param {string} dest - Final file path.
 * @param {string} ua - User-Agent header value to send.
 * @returns {Promise<{ok: true, size: number} |
 *   {ok: false, status?: number, reason?: string, error?: string}>}
 *   `status` is set for non-2xx responses, `reason` when the payload is
 *   smaller than 10000 bytes, `error` for thrown exceptions (network failure,
 *   timeout, filesystem error).
 */
async function downloadVideo(url, dest, ua) {
  const partPath = `${dest}.part`;
  try {
    const r = await fetch(url, {
      headers: { 'User-Agent': ua, 'Referer': TURBO_BASE + '/' },
      signal: AbortSignal.timeout(600000), // 10 min for big videos
    });
    if (!r.ok) return { ok: false, status: r.status };
    // A response without a body is equivalent to a zero-byte payload.
    if (!r.body) return { ok: false, reason: 'too small 0' };

    await pipeline(Readable.fromWeb(r.body), createWriteStream(partPath));

    const { size } = await stat(partPath);
    if (size < 10000) {
      // Tiny payloads are treated as failed downloads and not kept.
      await rm(partPath, { force: true });
      return { ok: false, reason: `too small ${size}` };
    }
    await rename(partPath, dest);
    return { ok: true, size };
  } catch (e) {
    // Best-effort cleanup of the partial file; the original error is what
    // gets reported, so a failure to remove it is deliberately ignored.
    await rm(partPath, { force: true }).catch(() => {});
    return { ok: false, error: e.message };
  }
}
/**
 * Load an embed page through an existing FlareSolverr session and extract
 * the mp4 URL from the returned markup.
 *
 * @param {string} sessionId - FlareSolverr session to fetch with.
 * @param {string} embedUrl - Embed page URL.
 * @returns {Promise<string|null>} The mp4 URL, or null when the page did not
 *   come back with status 200 or contained no recognisable mp4 URL.
 */
async function resolveEmbed(sessionId, embedUrl) {
  // No cookies are forwarded for this request (empty cookie string).
  const { status, html } = await fsGet(sessionId, embedUrl, '');
  return status === 200 ? extractMp4FromHtml(html) : null;
}
/**
 * Load an album page through an existing FlareSolverr session and collect
 * the video ids it references.
 *
 * Ids are gathered from two places, in this order, with duplicates removed:
 *   1. `data-id="..."` attributes whose value is at least six characters
 *      from [A-Za-z0-9_-];
 *   2. `turbo.<tld>/embed/<id>` links anywhere in the markup.
 *
 * @param {string} sessionId - FlareSolverr session to fetch with.
 * @param {string} albumUrl - Album page URL.
 * @returns {Promise<string[]>} Unique ids in first-seen order; empty when the
 *   page did not come back with status 200.
 */
async function resolveAlbumIds(sessionId, albumUrl) {
  const page = await fsGet(sessionId, albumUrl, '');
  if (page.status !== 200) return [];

  const markup = String(page.html);
  const captureGroup = (hit) => hit[1];
  const tileIds = Array.from(
    markup.matchAll(/data-id=["']([A-Za-z0-9_-]{6,})["']/g),
    captureGroup,
  );
  const linkedIds = Array.from(
    markup.matchAll(/turbo\.[a-z]+\/embed\/([A-Za-z0-9_-]+)/g),
    captureGroup,
  );

  return [...new Set([...tileIds, ...linkedIds])];
}
/**
 * Resolve a turbo URL (embed or album) and download all videos found.
 *
 * An `/embed/<id>` URL yields a single video; a URL containing `/a/` is
 * treated as an album and expanded to its video ids. Any other URL is logged
 * as unrecognized and nothing is downloaded.
 *
 * Pass an existing FlareSolverr sessionId to reuse it across many calls;
 * otherwise one is created and destroyed per call. A session supplied by the
 * caller is never destroyed here.
 *
 * Failures for an individual video (unresolvable mp4, download error, file
 * already present) are logged and skipped so the remaining videos are still
 * attempted. A failure to create the FlareSolverr session propagates to the
 * caller.
 *
 * @param {string} url - turbo embed or album URL.
 * @param {string} outputDir - Directory the videos are written into; its
 *   last path segment is used as the folder name when recording the file.
 * @param {(msg: string) => void} logFn - Progress/error logger.
 * @param {string} [userAgent] - User-Agent for the mp4 download; DEFAULT_UA
 *   is used when omitted.
 * @param {string} [fsSession] - Existing FlareSolverr session id to reuse.
 * @returns {Promise<number>} The count of videos successfully downloaded.
 */
export async function downloadTurbo(url, outputDir, logFn, userAgent, fsSession) {
  const ua = userAgent || DEFAULT_UA;
  const folderName = basename(outputDir);
  let ownSession = false;
  let sessionId = fsSession;

  try {
    if (!sessionId) {
      sessionId = await fsCreateSession(TURBO_BASE + '/');
      ownSession = true;
    }

    let embedIds = [];
    const mEmbed = url.match(/turbo\.[a-z]+\/embed\/([A-Za-z0-9_-]+)/i);
    if (mEmbed) {
      embedIds = [mEmbed[1]];
    } else if (/\/a\//i.test(url)) {
      logFn(`turbo: resolving album ${url}`);
      embedIds = await resolveAlbumIds(sessionId, url);
      logFn(`turbo: album has ${embedIds.length} video(s)`);
    } else {
      logFn(`turbo: unrecognized URL ${url}`);
      return 0;
    }

    let count = 0;
    for (const id of embedIds) {
      // Embed pages are always requested from the canonical origin,
      // regardless of which turbo host the original link used.
      const embedUrl = `${TURBO_BASE}/embed/${id}`;
      try {
        const mp4 = await resolveEmbed(sessionId, embedUrl);
        if (!mp4) {
          logFn(`turbo: could not resolve mp4 for ${id}`);
          continue;
        }
        const filename = turboFilename(mp4, id);
        const dest = join(outputDir, filename);
        if (existsSync(dest)) {
          logFn(`turbo: already have ${filename}`);
          continue;
        }
        const dl = await downloadVideo(mp4, dest, ua);
        if (dl.ok) {
          try {
            upsertMediaFile(folderName, filename, 'video', dl.size, Date.now(), null);
          } catch (dbErr) {
            // The file is already on disk, so a bookkeeping failure must not
            // turn the download into a failure — but it should be visible.
            logFn(`turbo: could not record ${filename} in media index: ${dbErr.message}`);
          }
          logFn(`Downloaded: ${filename} (${(dl.size / (1024 * 1024)).toFixed(1)} MB) [video]`);
          count++;
        } else {
          logFn(`turbo: download failed ${filename} - ${dl.status || dl.error || dl.reason}`);
        }
      } catch (e) {
        logFn(`turbo: error for ${id}: ${e.message}`);
      }
    }
    return count;
  } finally {
    if (ownSession && sessionId) {
      // A throw from inside `finally` would replace the return value (or the
      // in-flight error) of the try block, so teardown failures are logged
      // rather than propagated.
      try {
        await fsDestroySession(sessionId);
      } catch (e) {
        logFn(`turbo: failed to destroy FlareSolverr session: ${e.message}`);
      }
    }
  }
}