Route SimpCity forum scraping through FlareSolverr + add turbo.cr resolver

DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so
a direct Node fetch returns 403 even with valid cookies. Page HTML for any
forum_site with stored cookies is now fetched via a FlareSolverr browser
session opened once per scrape job.

- Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha; those
  cookies seed undetected_chromedriver, where Turnstile auto-solves in the
  real browser and the login form is submitted; the final cookies and browser
  UA are then persisted to forum_sites
- Per-site user_agent column so subsequent scraper requests match the UA the
  cookies were issued for (DDoS-Guard rejects UA mismatches)
- XenForo search rewritten as a proper CSRF-token POST to /search/search,
  followed by parsing the results page; this replaces the broken ?q=... GET
  that only returned the search form
- Pagination regex fallback in detectMaxPage catches XenForo pages that
  cheerio's class-based selectors miss
- New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering
  the page via FlareSolverr and grabbing the signed mp4 from the resolved
  <video src> attribute (gallery-dl can't extract these — obfuscated WASM)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-04-29 19:33:54 -05:00
parent 236f36aae6
commit aa4f1157d1
6 changed files with 589 additions and 78 deletions
+64 -25
View File
@@ -5,6 +5,8 @@ import { pipeline } from 'stream/promises';
import { execFile } from 'child_process';
import { promisify } from 'util';
import { upsertMediaFile } from '../db.js';
import { fsGet } from '../flaresolverr.js';
import { isTurboUrl, downloadTurbo } from './turbo.js';
const execFileAsync = promisify(execFile);
@@ -20,11 +22,13 @@ export class CookieExpiredError extends Error {
}
// Replace DDoS-Guard __ddg9_ cookie IP with server's IP so cookies work from any browser
function fixCookieIp(cookies) {
export function fixCookieIp(cookies) {
if (!cookies) return cookies;
return cookies.replace(/__ddg9_=[^;]+/, `__ddg9_=${SERVER_IP}`);
}
export const FORUM_UA = UA;
const IMAGE_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']);
const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.webm', '.mkv', '.m4v', '.wmv', '.flv', '.ts']);
const SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star', 'dc_thumbnails'];
@@ -69,13 +73,20 @@ export function getPageUrl(baseUrl, pageNum) {
return url.split('#')[0];
}
export async function detectMaxPage(baseUrl, logFn, cookies) {
export async function detectMaxPage(baseUrl, logFn, cookies, userAgent, fsSession) {
try {
const headers = { 'User-Agent': UA };
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
const resp = await fetch(baseUrl, { headers, signal: AbortSignal.timeout(15000) });
if (!resp.ok) return null;
const html = await resp.text();
let html;
if (fsSession) {
const r = await fsGet(fsSession, baseUrl, cookies);
if (r.status !== 200) return null;
html = r.html;
} else {
const headers = { 'User-Agent': userAgent || UA };
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
const resp = await fetch(baseUrl, { headers, signal: AbortSignal.timeout(15000) });
if (!resp.ok) return null;
html = await resp.text();
}
const $ = cheerio.load(html);
let maxPage = 1;
@@ -91,6 +102,17 @@ export async function detectMaxPage(baseUrl, logFn, cookies) {
if (n > maxPage && n < 10000) maxPage = n;
}
});
// Final fallback: scan raw HTML for any page-N references (XenForo's
// serialized pagination sometimes only appears in href attributes that
// cheerio's class-based selectors miss).
if (maxPage === 1) {
const re = /page-(\d+)/g;
let m;
while ((m = re.exec(html)) !== null) {
const n = parseInt(m[1], 10);
if (n > maxPage && n < 10000) maxPage = n;
}
}
if (maxPage > 1) {
logFn(`Detected ${maxPage} pages`);
@@ -123,7 +145,7 @@ function tryFullSizeUrl(thumbUrl) {
return candidates;
}
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, userAgent) {
if (downloadedSet.has(url)) return false;
if (!isImageUrl(url)) return false;
const lower = url.toLowerCase();
@@ -142,7 +164,7 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
}
try {
const dlHeaders = { 'User-Agent': UA };
const dlHeaders = { 'User-Agent': userAgent || UA };
if (cookies) dlHeaders['Cookie'] = fixCookieIp(cookies);
const resp = await fetch(url, { headers: dlHeaders, signal: AbortSignal.timeout(30000) });
if (!resp.ok) {
@@ -171,10 +193,16 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
}
// Use gallery-dl to download from external hosts (bunkr, saint, cyberdrop, etc.)
async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn) {
async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn, userAgent, fsSession) {
if (downloadedSet.has(url)) return 0;
downloadedSet.add(url);
// turbo.cr uses an obfuscated WASM player — gallery-dl can't extract the
// signed mp4 URL. Resolve via FlareSolverr (renders JS) instead.
if (isTurboUrl(url)) {
return await downloadTurbo(url, outputDir, logFn, userAgent, fsSession);
}
logFn(`Resolving via gallery-dl: ${url}`);
try {
@@ -240,23 +268,34 @@ async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn) {
}
}
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies) {
logFn(`Fetching page: ${pageUrl}`);
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies, userAgent, fsSession) {
logFn(`Fetching page: ${pageUrl}${fsSession ? ' [via FlareSolverr]' : ''}`);
let html;
try {
const headers = { 'User-Agent': UA };
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
const resp = await fetch(pageUrl, { headers, signal: AbortSignal.timeout(15000) });
if (!resp.ok) {
// SimpCity returns 404 for expired sessions, 403 for blocked
if (cookies && (resp.status === 404 || resp.status === 403)) {
throw new CookieExpiredError(resp.status);
if (fsSession) {
const r = await fsGet(fsSession, pageUrl, cookies);
if (r.status !== 200) {
if (cookies && (r.status === 404 || r.status === 403)) {
throw new CookieExpiredError(r.status);
}
logFn(`Failed to fetch page (${r.status})`);
return 0;
}
logFn(`Failed to fetch page (${resp.status})`);
return 0;
html = r.html;
} else {
const headers = { 'User-Agent': userAgent || UA };
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
const resp = await fetch(pageUrl, { headers, signal: AbortSignal.timeout(15000) });
if (!resp.ok) {
if (cookies && (resp.status === 404 || resp.status === 403)) {
throw new CookieExpiredError(resp.status);
}
logFn(`Failed to fetch page (${resp.status})`);
return 0;
}
html = await resp.text();
}
html = await resp.text();
} catch (err) {
if (err instanceof CookieExpiredError) throw err;
logFn(`Failed to fetch page: ${err.message}`);
@@ -359,14 +398,14 @@ export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn,
// Download images
for (const imgUrl of imageUrls) {
if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies)) {
if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies, userAgent)) {
count++;
}
}
// Download from external hosts via gallery-dl
// Download from external hosts (turbo.cr handled via FlareSolverr; rest via gallery-dl)
for (const extUrl of externalUrls) {
const dlCount = await downloadFromExternalHost(extUrl, outputDir, downloadedSet, logFn);
const dlCount = await downloadFromExternalHost(extUrl, outputDir, downloadedSet, logFn, userAgent, fsSession);
count += dlCount;
}