aa4f1157d1
DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so direct Node fetch returns 403 even with valid cookies. Page HTML for any forum_site with stored cookies is now fetched via a FlareSolverr browser session opened once per scrape job. - Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those cookies seed undetected_chromedriver, Turnstile auto-solves in the real browser, login form submits, final cookies + browser UA persist to forum_sites - Per-site user_agent column so subsequent scraper requests match the UA the cookies were issued for (DDoS-Guard rejects UA mismatches) - XenForo search rewritten as proper CSRF POST /search/search → results page parse, replacing the broken ?q=... GET that only returned the search form - Pagination regex fallback in detectMaxPage catches XenForo pages that cheerio's class-based selectors miss - New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering the page via FlareSolverr and grabbing the signed mp4 from the resolved <video src> attribute (gallery-dl can't extract these — obfuscated WASM) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
245 lines
8.4 KiB
JavaScript
245 lines
8.4 KiB
JavaScript
import { exec, execFile } from 'child_process';
import path from 'path';
import { fileURLToPath } from 'url';
import { promisify } from 'util';

import { Router } from 'express';

import { getForumSiteById, updateForumSite } from './db.js';
|
|
|
|
/** Drop trailing slashes so `${base}/v1`-style joins never produce a double slash. */
function trimTrailingSlashes(url) {
  return url.replace(/\/+$/, '');
}

// Promise-returning wrapper around child_process.exec (resolves to { stdout, stderr }).
const execAsync = promisify(exec);

// Express router carrying the /api/flaresolverr/* endpoints; exported as the module default.
const router = Router();

// Base URL of the FlareSolverr service. `||` (not `??`) is deliberate so an
// empty environment variable also falls back to the local default.
const FLARESOLVERR_URL = trimTrailingSlashes(process.env.FLARESOLVERR_URL || 'http://localhost:8191');

// Chromium binary location, handed to the login helper through its environment.
const CHROMIUM_PATH = process.env.CHROMIUM_PATH || '/usr/bin/chromium-browser';

// ES modules have no built-in __filename/__dirname; rebuild them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
|
|
|
|
/**
 * Send one command to FlareSolverr's `/v1` endpoint and return the parsed reply.
 *
 * @param {object} payload - FlareSolverr command object (`cmd`, `url`, `session`, ...).
 * @param {number} [timeoutMs=130000] - Abort the HTTP request after this many milliseconds.
 * @returns {Promise<object>} The decoded JSON body; its `status` is always `'ok'`.
 * @throws {Error} When the body is not JSON or reports a non-`ok` status. A
 *   timeout or network failure rejects with whatever `fetch` throws.
 */
async function fsCall(payload, timeoutMs = 130000) {
  const resp = await fetch(`${FLARESOLVERR_URL}/v1`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(timeoutMs),
  });

  let data;
  try {
    data = await resp.json();
  } catch (err) {
    // A reverse proxy or crashed service answers with HTML/plain text; report
    // that as a FlareSolverr failure instead of leaking a bare SyntaxError.
    throw new Error(`FlareSolverr error: HTTP ${resp.status} with a non-JSON body`, { cause: err });
  }

  if (data?.status !== 'ok') {
    throw new Error(`FlareSolverr error: ${data?.message || JSON.stringify(data)}`);
  }
  return data;
}
|
|
|
|
/**
 * Serialise a list of `{ name, value }` cookie objects into a single
 * `name=value; name2=value2` string.
 *
 * @param {Array<{name: string, value: string}>|null|undefined} cookies
 * @returns {string} The joined cookie string; `''` for an empty or missing list.
 */
function cookieArrayToString(cookies) {
  return (cookies ?? []).map(({ name, value }) => `${name}=${value}`).join('; ');
}
|
|
|
|
/**
 * Create a FlareSolverr browser session and optionally warm it up so the
 * DDoS-Guard / Cloudflare cookies for the target host are already in the
 * session before the first real request.
 *
 * A failed warm-up is logged and otherwise ignored: the session itself is
 * still usable, so it is returned anyway.
 *
 * @param {string} [warmUpUrl] - URL to load once inside the new session.
 * @returns {Promise<string>} Session id to pass to fsGet()/fsPost() and,
 *   when finished, to fsDestroySession().
 * @throws {Error} When the session cannot be created or no id is returned.
 */
export async function fsCreateSession(warmUpUrl) {
  const created = await fsCall({ cmd: 'sessions.create' }, 60000);
  const sessionId = created.session;
  if (!sessionId) {
    // Without an id the caller could neither reuse nor destroy the session.
    throw new Error('FlareSolverr error: sessions.create returned no session id');
  }

  if (warmUpUrl) {
    try {
      await fsCall({ cmd: 'request.get', url: warmUpUrl, session: sessionId, maxTimeout: 90000 });
    } catch (e) {
      console.warn(`[flaresolverr] Warm-up GET ${warmUpUrl} failed: ${e.message}`);
    }
  }

  return sessionId;
}
|
|
|
|
/**
 * Destroy a FlareSolverr session. This is best-effort cleanup: a missing id
 * is a no-op, and a failing destroy call is logged rather than thrown so it
 * can safely run from `finally` blocks.
 *
 * @param {string|null|undefined} sessionId - Id returned by fsCreateSession().
 * @returns {Promise<void>}
 */
export async function fsDestroySession(sessionId) {
  if (!sessionId) return;

  try {
    await fsCall({ cmd: 'sessions.destroy', session: sessionId }, 30000);
  } catch (e) {
    console.warn(`[flaresolverr] Failed to destroy session ${sessionId}: ${e.message}`);
  }
}
|
|
|
|
/** Pause between fsGet() attempts, in milliseconds. */
const FS_RETRY_DELAY_MS = 3000;

/**
 * Turn a `name=value; name2=value2` cookie string into the `[{ name, value }]`
 * list sent in a FlareSolverr request's `cookies` field.
 *
 * Entries with an empty name or value are dropped, and so are DDoS-Guard's
 * own cookies (`__ddg*`, `ddg_last_challenge`).
 * NOTE(review): presumably those are skipped so the stored copies do not
 * overwrite the ones the browser session obtained itself — confirm.
 */
function parseAuthCookies(cookieStr) {
  return (cookieStr || '')
    .split(';')
    .map((pair) => {
      // Only the first '=' separates name from value; values may contain '='.
      const [name, ...rest] = pair.trim().split('=');
      return { name: name.trim(), value: rest.join('=') };
    })
    .filter(({ name, value }) =>
      name && value && !name.startsWith('__ddg') && name !== 'ddg_last_challenge');
}

/**
 * GET a URL through a FlareSolverr session. The site's auth cookies (as a
 * cookie string) are sent along with the request so the page is fetched
 * authenticated from inside the session's browser.
 *
 * @param {string} sessionId - Id returned by fsCreateSession().
 * @param {string} url - Page to load.
 * @param {string} [cookieStr] - `name=value; ...` auth cookies for the site.
 * @param {number} [retries=1] - Extra attempts after the first one.
 * @returns {Promise<{status: number, html: string, finalUrl: string}>}
 *   The page, or `{ status: 0, html: '', finalUrl: '' }` when no attempt
 *   produced a solution with a status.
 * @throws {Error} The FlareSolverr error of the last attempt, if it threw.
 */
export async function fsGet(sessionId, url, cookieStr, retries = 1) {
  const cookies = parseAuthCookies(cookieStr);

  for (let attempt = 0; attempt <= retries; attempt++) {
    const isLastAttempt = attempt >= retries;
    try {
      const r = await fsCall({
        cmd: 'request.get',
        url,
        session: sessionId,
        cookies,
        maxTimeout: 90000,
      });
      if (r.solution?.status) {
        return { status: r.solution.status, html: r.solution.response, finalUrl: r.solution.url };
      }
    } catch (e) {
      if (isLastAttempt) throw e;
      console.warn(`[flaresolverr] GET ${url} failed (attempt ${attempt + 1}/${retries + 1}): ${e.message}`);
    }

    // Only wait when another attempt follows; sleeping after the last one
    // would just delay the empty result.
    if (!isLastAttempt) {
      await new Promise((resolve) => setTimeout(resolve, FS_RETRY_DELAY_MS));
    }
  }

  return { status: 0, html: '', finalUrl: '' };
}

/**
 * POST form-encoded data through a FlareSolverr session.
 *
 * @param {string} sessionId - Id returned by fsCreateSession().
 * @param {string} url - Form action URL.
 * @param {string} [cookieStr] - `name=value; ...` auth cookies for the site.
 * @param {string|object|URLSearchParams} postData - The form body. A string is
 *   sent as-is (it must already be `application/x-www-form-urlencoded`); a
 *   flat object or URLSearchParams is encoded first.
 * @returns {Promise<{status: number, html: string, finalUrl: string}>}
 *   The response, or `{ status: 0, html: '', finalUrl: '' }` when
 *   FlareSolverr returned no solution.
 * @throws {Error} When the FlareSolverr call fails.
 */
export async function fsPost(sessionId, url, cookieStr, postData) {
  const cookies = parseAuthCookies(cookieStr);
  const body = typeof postData === 'object' && postData !== null
    ? new URLSearchParams(postData).toString()
    : postData;

  const r = await fsCall({
    cmd: 'request.post',
    url,
    session: sessionId,
    cookies,
    postData: body,
    maxTimeout: 90000,
  });

  const { solution } = r;
  if (!solution) {
    // Same "nothing came back" shape fsGet() uses, instead of a TypeError.
    return { status: 0, html: '', finalUrl: '' };
  }
  return { status: solution.status, html: solution.response, finalUrl: solution.url };
}
|
|
|
|
// Promise-returning execFile: runs a program with an argv array, no shell involved.
const execFileAsync = promisify(execFile);

/** Lifetime recorded for freshly issued cookies (25 days), in milliseconds. */
const COOKIE_LIFETIME_MS = 25 * 24 * 60 * 60 * 1000;

/** Echo each non-empty line of the login helper's stderr through `log`, with the module prefix. */
function logHelperOutput(stderr, log) {
  if (!stderr) return;
  for (const line of String(stderr).split('\n').filter(Boolean)) {
    log(`[flaresolverr] ${line}`);
  }
}

/** Return the `error` field of the helper's JSON stdout, or null when there is none to read. */
function readHelperError(stdout) {
  if (!stdout) return null;
  try {
    const parsed = JSON.parse(String(stdout).trim());
    return parsed?.error ? String(parsed.error) : null;
  } catch {
    // stdout was not JSON (crash output, partial write): nothing to extract.
    return null;
  }
}

/** Load the site root in a throwaway FlareSolverr session and return the cookies it ends up with. */
async function collectDdosGuardCookies(baseUrl) {
  let sessionId = null;
  try {
    const created = await fsCall({ cmd: 'sessions.create' }, 60000);
    sessionId = created.session;
    const landing = await fsCall({
      cmd: 'request.get',
      url: `${baseUrl}/`,
      session: sessionId,
      maxTimeout: 120000,
    });
    return landing.solution?.cookies ?? [];
  } finally {
    // Best-effort cleanup; fsDestroySession ignores a null id and never throws.
    await fsDestroySession(sessionId);
  }
}

/**
 * Refresh forum cookies via a hybrid flow:
 *  1) FlareSolverr clears DDoS-Guard's "I'm not a robot" captcha and returns the
 *     __ddg* cookies in JSON form.
 *  2) undetected_chromedriver is launched (through login_helper.py under
 *     xvfb-run) with those cookies pre-loaded, so it lands directly on the
 *     login page (skipping the captcha). Turnstile then auto-solves in the real
 *     browser context, the form is submitted, and we extract the final session
 *     cookies (including the user-identity cookie).
 *
 * This is the only flow we've found that handles both DDoS-Guard captcha and
 * Cloudflare Turnstile without external paid services.
 *
 * On success the new cookies, an expiry 25 days ahead and the browser's
 * user agent are stored on the forum site record.
 *
 * NOTE(review): the credentials are handed to the helper as command-line
 * arguments, so they are visible in the process list while it runs. Changing
 * that needs a matching change in login_helper.py.
 *
 * @param {number} siteId - Id of the forum site record to refresh.
 * @returns {Promise<*>} The `cookies` value reported by the login helper
 *   (presumably a cookie string — confirm against login_helper.py).
 * @throws {Error} When the site is unknown, has no credentials, FlareSolverr
 *   yields no cookies, or the helper fails ("Cookie refresh failed: ...").
 */
export async function refreshForumCookies(siteId) {
  const site = getForumSiteById(siteId);
  if (!site) throw new Error(`Forum site ${siteId} not found`);
  if (!site.username || !site.password) {
    throw new Error('Forum site has no saved credentials — set username and password first');
  }

  const baseUrl = (site.base_url || 'https://simpcity.cr').replace(/\/$/, '');
  const loginUrl = `${baseUrl}/login/`;

  console.log(`[flaresolverr] Refreshing cookies for site ${siteId} (${site.name})`);

  // Step 1: get DDoS-Guard cookies via FlareSolverr (no login attempt yet)
  const ddgCookies = await collectDdosGuardCookies(baseUrl);
  console.log(`[flaresolverr] DDoS-Guard cleared, got ${ddgCookies.length} cookies`);
  if (ddgCookies.length === 0) {
    throw new Error('FlareSolverr returned no cookies — DDoS-Guard not bypassed');
  }

  // Step 2: launch chromedriver with pre-loaded cookies + perform login.
  // Arguments go to execFile as an array, so no shell parses them and quotes
  // or metacharacters in the path or credentials cannot break the command.
  const helperPath = path.join(__dirname, 'login_helper.py');
  const helperArgs = [
    '--auto-servernum',
    '--server-args=-screen 0 1920x1080x24',
    'python3',
    '-u',
    helperPath,
    loginUrl,
    String(site.username),
    String(site.password),
    JSON.stringify(ddgCookies),
  ];

  try {
    const { stdout, stderr } = await execFileAsync('xvfb-run', helperArgs, {
      timeout: 180000,
      maxBuffer: 10 * 1024 * 1024,
      env: { ...process.env, CHROMIUM_PATH },
    });

    logHelperOutput(stderr, (line) => console.log(line));

    const result = JSON.parse(stdout.trim());
    if (!result.ok) throw new Error(result.error || 'Login failed');

    const expiresAt = new Date(Date.now() + COOKIE_LIFETIME_MS).toISOString();
    updateForumSite(siteId, {
      cookies: result.cookies,
      cookie_expires_at: expiresAt,
      user_agent: result.user_agent || '',
    });

    console.log(`[flaresolverr] Cookie refresh successful for site ${siteId} (UA: ${result.user_agent || 'default'})`);
    return result.cookies;
  } catch (err) {
    logHelperOutput(err.stderr, (line) => console.error(line));
    // When the helper exits non-zero its JSON stdout usually names the real
    // cause; prefer that over the generic "Command failed" message.
    const reason = readHelperError(err.stdout) ?? err.message;
    throw new Error(`Cookie refresh failed: ${reason}`, { cause: err });
  }
}
|
|
|
|
// --- API Endpoints ---
|
|
|
|
// Manual cookie refresh.
// Responds with { ok: true, cookies } on success, 400 { error } when :siteId
// is not a number, and 500 { error } when the refresh itself fails.
router.post('/api/flaresolverr/refresh/:siteId', async (req, res) => {
  const siteId = Number.parseInt(req.params.siteId, 10);
  if (!Number.isInteger(siteId)) {
    // Reject up front rather than running the refresh with NaN and
    // answering 500 "Forum site NaN not found".
    res.status(400).json({ error: `Invalid site id: ${req.params.siteId}` });
    return;
  }

  try {
    const cookieStr = await refreshForumCookies(siteId);
    res.json({ ok: true, cookies: cookieStr });
  } catch (err) {
    console.error(`[flaresolverr] Refresh failed for site ${siteId}:`, err.message);
    res.status(500).json({ error: err.message });
  }
});
|
|
|
|
// Check if cookie refresh is available (Chromium + xvfb-run installed).
// Always answers 200 with { available: boolean } (plus `error` when nothing
// could be found).
router.get('/api/flaresolverr/status', async (_req, res) => {
  try {
    // xvfb-run AND one of the chromium binaries must exist. The parentheses
    // matter: without them the shell reads `(A && B) || C`, so a machine with
    // only `chromium` and no xvfb-run would pass the check.
    await execAsync('which xvfb-run && (which chromium-browser || which chromium)', { timeout: 5000 });
    // Check for undetected_chromedriver python package
    await execAsync('python3 -c "import undetected_chromedriver"', { timeout: 5000 });
    res.json({ available: true });
  } catch {
    // Fallback: check FlareSolverr service.
    // NOTE(review): refreshForumCookies() needs both FlareSolverr and the
    // local browser toolchain, so `available: true` from this branch alone
    // may overstate what works — confirm what the UI expects.
    try {
      const resp = await fetch(`${FLARESOLVERR_URL}/health`, {
        signal: AbortSignal.timeout(5000),
      });
      res.json({ available: resp.ok });
    } catch {
      res.json({ available: false, error: 'Neither undetected_chromedriver nor FlareSolverr available' });
    }
  }
});
|
|
|
|
export default router;
|