Route SimpCity forum scraping through FlareSolverr + add turbo.cr resolver
DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so direct Node fetch returns 403 even with valid cookies. Page HTML for any forum_site with stored cookies is now fetched via a FlareSolverr browser session opened once per scrape job.

- Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those cookies seed undetected_chromedriver, Turnstile auto-solves in the real browser, login form submits, final cookies + browser UA persist to forum_sites
- Per-site user_agent column so subsequent scraper requests match the UA the cookies were issued for (DDoS-Guard rejects UA mismatches)
- XenForo search rewritten as proper CSRF POST /search/search → results page parse, replacing the broken ?q=... GET that only returned the search form
- Pagination regex fallback in detectMaxPage catches XenForo pages that cheerio's class-based selectors miss
- New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering the page via FlareSolverr and grabbing the signed mp4 from the resolved <video src> attribute (gallery-dl can't extract these — obfuscated WASM)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,138 @@
|
||||
import { writeFileSync, existsSync } from 'fs';
|
||||
import { join, basename } from 'path';
|
||||
import { fsCreateSession, fsDestroySession, fsGet } from '../flaresolverr.js';
|
||||
import { upsertMediaFile } from '../db.js';
|
||||
|
||||
// Matches http(s) URLs whose host is `turbo.<tld>` (optionally prefixed with
// `www.`), case-insensitively. The TLD is any run of word characters, not just
// `cr`, and a slash must follow the host.
const TURBO_HOST_RE = /^https?:\/\/(?:www\.)?turbo\.\w+\//i;
|
||||
// Site origin used in this module as the download Referer, as the URL a new
// FlareSolverr session is opened against, and as the prefix of embed page URLs.
const TURBO_BASE = 'https://turbo.cr';
|
||||
// Fallback User-Agent for video downloads when downloadTurbo() is called
// without a userAgent argument.
const DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
||||
|
||||
/**
 * Report whether a URL points at a `turbo.<tld>` host.
 *
 * @param {string} url - Absolute URL to classify.
 * @returns {boolean} True when the URL matches TURBO_HOST_RE.
 */
export function isTurboUrl(url) {
  const hostMatches = TURBO_HOST_RE.test(url);
  return hostMatches;
}
|
||||
|
||||
/**
 * Decode the small set of HTML character references that can appear inside an
 * attribute value: the named references amp, lt, gt, quot and apos, plus the
 * numeric apostrophe forms #39 and #x27.
 *
 * Decoding happens in a single pass, so text that was escaped twice is only
 * unescaped once (an escaped ampersand followed by "lt;" yields the literal
 * entity text, not "<"). Unknown references are left untouched.
 *
 * @param {string} s - Raw attribute text taken from HTML.
 * @returns {string} The text with the supported references decoded.
 */
function unescapeHtml(s) {
  // Keyed by reference name without the surrounding "&" and ";".
  const decoded = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", '#39': "'", '#x27': "'" };
  return s.replace(/&(amp|lt|gt|quot|apos|#39|#x27);/gi, (_, ref) => decoded[ref.toLowerCase()]);
}
|
||||
|
||||
/**
 * Pull the resolved mp4 URL out of a rendered embed page.
 *
 * Two patterns are tried in order and the first hit wins:
 *   1. the `src` attribute of a `<video>` tag (Plyr writes the resolved URL
 *      there after the WASM has run);
 *   2. any absolute URL containing "turbocdn" and ".mp4" anywhere in the page.
 *
 * @param {string} html - Rendered page markup.
 * @returns {string|null} The HTML-unescaped mp4 URL, or null when none is found.
 */
function extractMp4FromHtml(html) {
  const candidates = [
    // Capture group 1 holds the attribute value.
    { pattern: /<video[^>]+\bsrc=["']([^"']+\.mp4[^"']*)["']/i, group: 1 },
    // Fallback: the whole match is the URL.
    { pattern: /https?:\/\/[^"'\s<>]*turbocdn[^"'\s<>]*\.mp4[^"'\s<>]*/i, group: 0 },
  ];

  for (const { pattern, group } of candidates) {
    const hit = html.match(pattern);
    if (hit) return unescapeHtml(hit[group]);
  }
  return null;
}
|
||||
|
||||
/**
 * Choose a local filename for a resolved mp4 URL.
 *
 * Preference order:
 *   1. the `fn` query parameter of the URL;
 *   2. the last path segment of the URL;
 *   3. `<fallbackId>.mp4` (or `turbo.mp4` when no id is given).
 *
 * The URL comes from remote page content and the caller joins the result onto
 * its output directory, so candidates 1 and 2 are reduced to a bare leaf name:
 * directory components (with either slash style) are dropped and the special
 * names "." and ".." are rejected. This keeps a crafted `fn` value from
 * writing outside the output directory.
 *
 * @param {string} mp4Url - Resolved video URL.
 * @param {string} [fallbackId] - Video id used when the URL yields no name.
 * @returns {string} A filename with no directory components.
 */
function turboFilename(mp4Url, fallbackId) {
  // Reduce an arbitrary string to a safe leaf name, or '' if nothing usable remains.
  const toLeaf = (raw) => {
    if (!raw) return '';
    const leaf = basename(String(raw).replace(/\\/g, '/'));
    return leaf === '.' || leaf === '..' ? '' : leaf;
  };

  try {
    const parsed = new URL(mp4Url);
    const fromQuery = toLeaf(parsed.searchParams.get('fn'));
    if (fromQuery) return fromQuery;
    const fromPath = toLeaf(parsed.pathname);
    if (fromPath) return fromPath;
  } catch {
    // Unparseable URL: fall through to the id-based name.
  }
  return (fallbackId || 'turbo') + '.mp4';
}
|
||||
|
||||
/**
 * Fetch a video over HTTP and write it to disk.
 *
 * The request carries the given User-Agent and a turbo Referer, and is aborted
 * after ten minutes. The whole response body is buffered in memory before it
 * is written. Bodies under 10000 bytes are rejected and nothing is written.
 *
 * Never throws: every failure is reported through the returned object.
 *
 * @param {string} url - Direct mp4 URL.
 * @param {string} dest - Destination file path.
 * @param {string} ua - User-Agent header value.
 * @returns {Promise<{ok: boolean, size?: number, status?: number, reason?: string, error?: string}>}
 *   `{ ok: true, size }` on success; otherwise `ok: false` plus one of
 *   `status` (non-2xx response), `reason` (body too small) or `error`
 *   (thrown error message).
 */
async function downloadVideo(url, dest, ua) {
  try {
    const requestInit = {
      headers: { 'User-Agent': ua, 'Referer': TURBO_BASE + '/' },
      signal: AbortSignal.timeout(600000), // 10 min for big videos
    };
    const response = await fetch(url, requestInit);
    if (!response.ok) {
      return { ok: false, status: response.status };
    }

    const payload = Buffer.from(await response.arrayBuffer());
    const size = payload.length;
    if (size < 10000) {
      return { ok: false, reason: `too small ${size}` };
    }

    writeFileSync(dest, payload);
    return { ok: true, size };
  } catch (err) {
    return { ok: false, error: err.message };
  }
}
|
||||
|
||||
/**
 * Load an embed page through the given FlareSolverr session and extract the
 * mp4 URL from the returned markup.
 *
 * @param {*} sessionId - Session identifier passed straight through to fsGet.
 * @param {string} embedUrl - Embed page URL.
 * @returns {Promise<string|null>} The mp4 URL, or null when the page status is
 *   not 200 or no mp4 URL is present in the HTML.
 */
async function resolveEmbed(sessionId, embedUrl) {
  const page = await fsGet(sessionId, embedUrl, '');
  return page.status === 200 ? extractMp4FromHtml(page.html) : null;
}
|
||||
|
||||
/**
 * Load an album page through the given FlareSolverr session and collect the
 * video ids it references.
 *
 * Ids are gathered from `data-id="..."` attributes (six or more id characters)
 * and then from any `turbo.<tld>/embed/<id>` links. Duplicates are dropped and
 * first-seen order is kept.
 *
 * @param {*} sessionId - Session identifier passed straight through to fsGet.
 * @param {string} albumUrl - Album page URL.
 * @returns {Promise<string[]>} Unique ids; empty when the page status is not 200.
 */
async function resolveAlbumIds(sessionId, albumUrl) {
  const page = await fsGet(sessionId, albumUrl, '');
  if (page.status !== 200) return [];

  const found = new Set();
  // Run one global pattern across the page, adding capture group 1 of each hit.
  const collect = (pattern) => {
    for (let hit = pattern.exec(page.html); hit !== null; hit = pattern.exec(page.html)) {
      found.add(hit[1]);
    }
  };

  // Album page: each video tile has data-id="<videoId>"
  collect(/data-id=["']([A-Za-z0-9_-]{6,})["']/g);
  // Also handle direct embed links if present
  collect(/turbo\.[a-z]+\/embed\/([A-Za-z0-9_-]+)/g);

  return Array.from(found);
}
|
||||
|
||||
/**
 * Resolve a turbo.cr URL (embed or album) and download all videos found.
 *
 * Pass an existing FlareSolverr sessionId to reuse it across many calls;
 * otherwise one is created and destroyed per call.
 *
 * URL handling:
 *   - `turbo.<tld>/embed/<id>` is treated as a single video;
 *   - any other URL containing `/a/` is treated as an album and its page is
 *     scanned for video ids;
 *   - anything else is logged as unrecognized and yields 0.
 *
 * Each video is resolved and downloaded independently: a failure for one id is
 * logged and the loop moves on. Files already present in outputDir are skipped
 * and not counted.
 *
 * @param {string} url - Embed or album URL.
 * @param {string} outputDir - Directory the videos are written into; its last
 *   path segment is used as the folder name for the media-file record.
 * @param {(message: string) => void} logFn - Receives progress and error lines.
 * @param {string} [userAgent] - User-Agent for the mp4 download; DEFAULT_UA if falsy.
 * @param {*} [fsSession] - Existing FlareSolverr session to reuse.
 * @returns {Promise<number>} The count of videos successfully downloaded.
 * @throws Propagates rejections from fsCreateSession and from album resolution.
 */
export async function downloadTurbo(url, outputDir, logFn, userAgent, fsSession) {
  const ua = userAgent || DEFAULT_UA;
  const folderName = basename(outputDir);
  let ownSession = false;
  let sessionId = fsSession;

  try {
    if (!sessionId) {
      sessionId = await fsCreateSession(TURBO_BASE + '/');
      ownSession = true;
    }

    let embedIds = [];
    const mEmbed = url.match(/turbo\.[a-z]+\/embed\/([A-Za-z0-9_-]+)/i);
    if (mEmbed) {
      embedIds = [mEmbed[1]];
    } else if (/\/a\//i.test(url)) {
      logFn(`turbo: resolving album ${url}`);
      embedIds = await resolveAlbumIds(sessionId, url);
      logFn(`turbo: album has ${embedIds.length} video(s)`);
    } else {
      logFn(`turbo: unrecognized URL ${url}`);
      return 0;
    }

    let count = 0;
    for (const id of embedIds) {
      const embedUrl = `${TURBO_BASE}/embed/${id}`;
      try {
        const mp4 = await resolveEmbed(sessionId, embedUrl);
        if (!mp4) {
          logFn(`turbo: could not resolve mp4 for ${id}`);
          continue;
        }

        const filename = turboFilename(mp4, id);
        const dest = join(outputDir, filename);
        if (existsSync(dest)) {
          logFn(`turbo: already have ${filename}`);
          continue;
        }

        const dl = await downloadVideo(mp4, dest, ua);
        if (dl.ok) {
          // Recording the file is best-effort: the video is already on disk, so
          // a DB failure is reported but does not fail the download.
          try {
            upsertMediaFile(folderName, filename, 'video', dl.size, Date.now(), null);
          } catch (e) {
            logFn(`turbo: could not record ${filename} in media db: ${e.message}`);
          }
          logFn(`Downloaded: ${filename} (${(dl.size / (1024 * 1024)).toFixed(1)} MB) [video]`);
          count++;
        } else {
          logFn(`turbo: download failed ${filename} - ${dl.status || dl.error || dl.reason}`);
        }
      } catch (e) {
        logFn(`turbo: error for ${id}: ${e.message}`);
      }
    }
    return count;
  } finally {
    // Only tear down a session this call created. A teardown failure is logged
    // rather than thrown, because throwing from `finally` would discard the
    // count being returned (or mask the original error).
    if (ownSession && sessionId) {
      try {
        await fsDestroySession(sessionId);
      } catch (e) {
        logFn(`turbo: failed to destroy session: ${e.message}`);
      }
    }
  }
}
|
||||
Reference in New Issue
Block a user